
Example 71 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

The class SerDeEncodedDataReader, method startReadSplitFromFile.

public void startReadSplitFromFile(FileSplit split, boolean[] splitIncludes, StripeData slice) throws IOException {
    boolean maySplitTheSplit = slice == null;
    ReaderWithOffsets offsetReader = null;
    @SuppressWarnings("rawtypes") RecordReader sourceReader = sourceInputFormat.getRecordReader(split, jobConf, reporter);
    Path path = split.getPath().getFileSystem(daemonConf).makeQualified(split.getPath());
    PartitionDesc partDesc = HiveFileFormatUtils.getFromPathRecursively(parts, path, null);
    try {
        offsetReader = createOffsetReader(sourceReader, partDesc.getTableDesc(), split);
        sourceReader = null;
    } finally {
        if (sourceReader != null) {
            try {
                sourceReader.close();
            } catch (Exception ex) {
                LlapIoImpl.LOG.error("Failed to close source reader", ex);
            }
        }
    }
    maySplitTheSplit = maySplitTheSplit && offsetReader.hasOffsets();
    try {
        StructObjectInspector originalOi = (StructObjectInspector) getOiFromSerDe();
        List<Integer> splitColumnIds = OrcInputFormat.genIncludedColumnsReverse(schema, splitIncludes, false);
        // fileread writes to the writer, which writes to orcWriter, which writes to cacheWriter
        EncodingWriter writer = VectorDeserializeOrcWriter.create(sourceInputFormat, sourceSerDe, parts, daemonConf, jobConf, split.getPath(), originalOi, splitColumnIds, splitIncludes, allocSize, encodeExecutor);
        // TODO: move this into ctor? EW would need to create CacheWriter then
        List<Integer> cwColIds = writer.isOnlyWritingIncludedColumns() ? splitColumnIds : columnIds;
        writer.init(new CacheWriter(bufferManager, cwColIds, splitIncludes, writer.isOnlyWritingIncludedColumns(), bufferFactory, isStopped), daemonConf, split.getPath());
        if (writer instanceof VectorDeserializeOrcWriter) {
            VectorDeserializeOrcWriter asyncWriter = (VectorDeserializeOrcWriter) writer;
            asyncWriter.startAsync(new AsyncCacheDataCallback());
            this.asyncWriters.add(asyncWriter);
        }
        currentFileRead = new FileReaderYieldReturn(offsetReader, split, writer, maySplitTheSplit, targetSliceRowCount);
    } finally {
        // Assignment is the last thing in the try block, so if it happened we assume success.
        if (currentFileRead != null)
            return;
        if (offsetReader == null)
            return;
        try {
            offsetReader.close();
        } catch (Exception ex) {
            LlapIoImpl.LOG.error("Failed to close source reader", ex);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) RecordReader(org.apache.hadoop.mapred.RecordReader) LineRecordReader(org.apache.hadoop.mapred.LineRecordReader) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
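
The PartitionDesc work in this example is the lookup of the split's qualified path in the path-to-partition map, followed by getTableDesc() to drive reader creation. Below is a minimal standalone sketch of that lookup, using only the calls shown above; the class and method names in the sketch are made up for illustration.

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.mapred.FileSplit;

public final class PartitionDescLookup {

    private PartitionDescLookup() {
    }

    /** Resolves the TableDesc for a split by finding the PartitionDesc that owns its path. */
    public static TableDesc tableDescForSplit(FileSplit split,
            Map<Path, PartitionDesc> pathToPartitionInfo, Configuration conf) throws IOException {
        Path rawPath = split.getPath();
        // Qualify the path so it matches the qualified keys of the path-to-partition map.
        Path qualified = rawPath.getFileSystem(conf).makeQualified(rawPath);
        PartitionDesc partDesc =
                HiveFileFormatUtils.getFromPathRecursively(pathToPartitionInfo, qualified, null);
        return partDesc.getTableDesc();
    }
}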

Example 72 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

The class Utilities, method createDummyFileForEmptyTable.

@SuppressWarnings("rawtypes")
private static Path createDummyFileForEmptyTable(JobConf job, MapWork work, Path hiveScratchDir, String alias) throws Exception {
    TableDesc tableDesc = work.getAliasToPartnInfo().get(alias).getTableDesc();
    if (tableDesc.isNonNative()) {
        // if it does not need native storage, we can't create an empty file for it.
        return null;
    }
    Properties props = tableDesc.getProperties();
    HiveOutputFormat outFileFormat = HiveFileFormatUtils.getHiveOutputFormat(job, tableDesc);
    Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job, props, false);
    LOG.info("Changed input file for alias {} to {}", alias, newPath);
    // update the work
    Map<Path, List<String>> pathToAliases = work.getPathToAliases();
    List<String> newList = new ArrayList<String>(1);
    newList.add(alias);
    pathToAliases.put(newPath, newList);
    work.setPathToAliases(pathToAliases);
    PartitionDesc pDesc = work.getAliasToPartnInfo().get(alias).clone();
    work.addPathToPartitionInfo(newPath, pDesc);
    return newPath;
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) HiveOutputFormat(org.apache.hadoop.hive.ql.io.HiveOutputFormat) ValidWriteIdList(org.apache.hadoop.hive.common.ValidWriteIdList) LinkedList(java.util.LinkedList) List(java.util.List) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) Properties(java.util.Properties)
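
The PartitionDesc-specific step here is the clone-and-register at the end: the alias keeps its existing partition metadata, but the clone is attached to the newly created dummy path. Below is a minimal sketch of that wiring, assuming the same MapWork accessors and types used in the example; the helper class and method names are hypothetical.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;

public final class MapWorkRewiring {

    private MapWorkRewiring() {
    }

    /** Points an alias at a new path, reusing the PartitionDesc already registered for it. */
    public static void pointAliasAtPath(MapWork work, String alias, Path newPath) {
        // Register the alias under the new path.
        Map<Path, List<String>> pathToAliases = work.getPathToAliases();
        List<String> aliases = new ArrayList<String>(1);
        aliases.add(alias);
        pathToAliases.put(newPath, aliases);
        work.setPathToAliases(pathToAliases);
        // Clone the alias's PartitionDesc and attach it to the new path so that
        // readers can still find the schema and SerDe information for it.
        PartitionDesc pDesc = work.getAliasToPartnInfo().get(alias).clone();
        work.addPathToPartitionInfo(newPath, pDesc);
    }
}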

Example 73 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

The class ExecMapper, method configure.

@Override
public void configure(JobConf job) {
    execContext = new ExecMapperContext(job);
    Utilities.tryLoggingClassPaths(job, l4j);
    setDone(false);
    try {
        jc = job;
        execContext.setJc(jc);
        // create map and fetch operators
        MapWork mrwork = Utilities.getMapWork(job);
        for (PartitionDesc part : mrwork.getAliasToPartnInfo().values()) {
            TableDesc tableDesc = part.getTableDesc();
            Utilities.copyJobSecretToTableProperties(tableDesc);
        }
        CompilationOpContext runtimeCtx = new CompilationOpContext();
        if (mrwork.getVectorMode()) {
            mo = new VectorMapOperator(runtimeCtx);
        } else {
            mo = new MapOperator(runtimeCtx);
        }
        mo.setConf(mrwork);
        // initialize map operator
        mo.initialize(job, null);
        mo.setChildren(job);
        l4j.info(mo.dump(0));
        // initialize map local work
        localWork = mrwork.getMapRedLocalWork();
        execContext.setLocalWork(localWork);
        MapredContext.init(true, new JobConf(jc));
        mo.passExecContext(execContext);
        mo.initializeLocalWork(jc);
        mo.initializeMapOperator(jc);
        if (localWork == null) {
            return;
        }
        // The following code is for mapjoin
        // initialize all the dummy ops
        l4j.info("Initializing dummy operator");
        List<Operator<? extends OperatorDesc>> dummyOps = localWork.getDummyParentOp();
        for (Operator<? extends OperatorDesc> dummyOp : dummyOps) {
            dummyOp.passExecContext(execContext);
            dummyOp.initialize(jc, null);
        }
    } catch (Throwable e) {
        abort = true;
        if (e instanceof OutOfMemoryError) {
            // Don't create a new object if we are already out of memory
            throw (OutOfMemoryError) e;
        } else {
            throw new RuntimeException("Map operator initialization failed", e);
        }
    }
}
Also used : MapOperator(org.apache.hadoop.hive.ql.exec.MapOperator) VectorMapOperator(org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) AbstractMapOperator(org.apache.hadoop.hive.ql.exec.AbstractMapOperator) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) JobConf(org.apache.hadoop.mapred.JobConf) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
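
The loop at the top of configure() touches every PartitionDesc registered in the MapWork before the operator tree is built. Below is a minimal sketch of iterating those descriptors, limited to accessors that appear in these examples; the helper class, method, and slf4j logger are assumptions for illustration.

import java.util.Map;

import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public final class MapWorkPartitions {

    private static final Logger LOG = LoggerFactory.getLogger(MapWorkPartitions.class);

    private MapWorkPartitions() {
    }

    /** Logs, for each alias in the MapWork, the table and partition spec it reads. */
    public static void logAliasPartitions(MapWork work) {
        for (Map.Entry<String, PartitionDesc> entry : work.getAliasToPartnInfo().entrySet()) {
            PartitionDesc part = entry.getValue();
            LOG.info("alias {} reads table {} with partition spec {}",
                    entry.getKey(), part.getTableDesc().getTableName(), part.getPartSpec());
        }
    }
}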

Example 74 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

The class CombineHiveInputFormat, method getSplits.

/**
 * Create Hive splits based on CombineFileSplit.
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.perfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
    init(job);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
    Path[] paths = getInputPaths(job);
    List<Path> nonCombinablePaths = new ArrayList<Path>(paths.length / 2);
    List<Path> combinablePaths = new ArrayList<Path>(paths.length / 2);
    int numThreads = Math.min(MAX_CHECK_NONCOMBINABLE_THREAD_NUM, (int) Math.ceil((double) paths.length / DEFAULT_NUM_PATH_PER_THREAD));
    // When there are no input paths, numThreads can be 0; in that case, Executors.newFixedThreadPool would fail.
    if (numThreads > 0) {
        try {
            Set<Integer> nonCombinablePathIndices = getNonCombinablePathIndices(job, paths, numThreads);
            for (int i = 0; i < paths.length; i++) {
                if (nonCombinablePathIndices.contains(i)) {
                    nonCombinablePaths.add(paths[i]);
                } else {
                    combinablePaths.add(paths[i]);
                }
            }
        } catch (Exception e) {
            LOG.error("Error checking non-combinable path", e);
            perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
            throw new IOException(e);
        }
    }
    // Store the previous value for the path specification
    String oldPaths = job.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
    LOG.debug("The received input paths are: [{}] against the property {}", oldPaths, org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
    // Process the normal splits
    if (nonCombinablePaths.size() > 0) {
        FileInputFormat.setInputPaths(job, nonCombinablePaths.toArray(new Path[nonCombinablePaths.size()]));
        InputSplit[] splits = super.getSplits(job, numSplits);
        for (InputSplit split : splits) {
            result.add(split);
        }
    }
    // Process the combine splits
    if (combinablePaths.size() > 0) {
        FileInputFormat.setInputPaths(job, combinablePaths.toArray(new Path[combinablePaths.size()]));
        Map<Path, PartitionDesc> pathToPartitionInfo = this.pathToPartitionInfo != null ? this.pathToPartitionInfo : Utilities.getMapWork(job).getPathToPartitionInfo();
        InputSplit[] splits = getCombineSplits(job, numSplits, pathToPartitionInfo);
        for (InputSplit split : splits) {
            result.add(split);
        }
    }
    // Restore the previous path specification in case some application depends on the original value being set.
    if (oldPaths != null) {
        job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, oldPaths);
    }
    // clear work from ThreadLocal after splits generated in case of thread is reused in pool.
    Utilities.clearWorkMapForConf(job);
    if (result.isEmpty() && paths.length > 0 && job.getBoolean(Utilities.ENSURE_OPERATORS_EXECUTED, false)) {
        // If there are no inputs, the execution engine skips the operator tree.
        // To prevent that, an opaque ZeroRows input is added here when needed.
        result.add(new HiveInputSplit(new NullRowsInputFormat.DummyInputSplit(paths[0]), ZeroRowsInputFormat.class.getName()));
    }
    LOG.info("Number of all splits " + result.size());
    perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
    return result.toArray(new InputSplit[result.size()]);
}
Also used : Path(org.apache.hadoop.fs.Path) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) ArrayList(java.util.ArrayList) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) InputSplit(org.apache.hadoop.mapred.InputSplit)
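
One detail worth noting in getSplits() is the save-and-restore of the input-dir property around the temporary FileInputFormat.setInputPaths calls, so callers that depend on the original value still see it afterwards. Below is a minimal sketch of that pattern in isolation; the helper class, method name, and Runnable callback are hypothetical.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public final class InputDirScope {

    private InputDirScope() {
    }

    /** Temporarily points the job at the given paths, restoring the original input dir afterwards. */
    public static void withInputPaths(JobConf job, Path[] paths, Runnable body) {
        String inputDirKey = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR;
        String oldPaths = job.get(inputDirKey);
        try {
            FileInputFormat.setInputPaths(job, paths);
            // Split generation (or anything else that reads the input dir) runs here.
            body.run();
        } finally {
            if (oldPaths != null) {
                job.set(inputDirKey, oldPaths);
            }
        }
    }
}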

Example 75 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

The class CombineHiveRecordReader, method extractSinglePartSpec.

private PartitionDesc extractSinglePartSpec(CombineHiveInputSplit hsplit) throws IOException {
    PartitionDesc part = null;
    Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache = new HashMap<>();
    for (Path path : hsplit.getPaths()) {
        PartitionDesc otherPart = HiveFileFormatUtils.getFromPathRecursively(pathToPartInfo, path, cache);
        LOG.debug("Found spec for {} {} from {}", path, otherPart, pathToPartInfo);
        if (part == null) {
            part = otherPart;
        } else if (otherPart != part) {
            // Assume we should have the exact same object.
            // TODO: we could also compare the schema and SerDe, and pass only those to the call
            // instead; most of the time these would be the same and LLAP IO can handle that.
            LOG.warn("Multiple partitions found; not going to pass a part spec to LLAP IO: {" + part.getPartSpec() + "} and {" + otherPart.getPartSpec() + "}");
            return null;
        }
    }
    return part;
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) Map(java.util.Map)
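
The example resolves several paths against one path-to-partition map, shares a single lookup cache across the calls, and gives up (returns null) as soon as two paths map to different PartitionDesc objects. Below is a minimal sketch of the same multi-path resolution; the helper class and method are hypothetical, and the identity comparison mirrors the example's assumption that one partition is represented by one PartitionDesc instance.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;

public final class SinglePartitionResolver {

    private SinglePartitionResolver() {
    }

    /** Returns the common PartitionDesc of all paths, or null if they span multiple partitions. */
    public static PartitionDesc resolveSinglePartition(Iterable<Path> paths,
            Map<Path, PartitionDesc> pathToPartInfo) throws IOException {
        // One cache map is shared across lookups, as in the example above.
        Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache = new HashMap<>();
        PartitionDesc result = null;
        for (Path path : paths) {
            PartitionDesc current =
                    HiveFileFormatUtils.getFromPathRecursively(pathToPartInfo, path, cache);
            if (result == null) {
                result = current;
            } else if (current != result) {
                // Paths belong to different partitions; no single spec can be returned.
                return null;
            }
        }
        return result;
    }
}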

Aggregations

PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 90
Path (org.apache.hadoop.fs.Path): 67
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 41
ArrayList (java.util.ArrayList): 39
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 27
LinkedHashMap (java.util.LinkedHashMap): 24
List (java.util.List): 23
JobConf (org.apache.hadoop.mapred.JobConf): 21
Map (java.util.Map): 18
Properties (java.util.Properties): 18
HashMap (java.util.HashMap): 17
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 17
IOException (java.io.IOException): 15
Operator (org.apache.hadoop.hive.ql.exec.Operator): 15
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 14
Configuration (org.apache.hadoop.conf.Configuration): 13
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 13
FileSystem (org.apache.hadoop.fs.FileSystem): 11
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 9
HiveInputFormat (org.apache.hadoop.hive.ql.io.HiveInputFormat): 9