Example 1 with MapJoinTableContainer

Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer in project hive by apache.

The class MapJoinOperator, method completeInitializationOp:

@SuppressWarnings("unchecked")
@Override
protected void completeInitializationOp(Object[] os) throws HiveException {
    if (os.length != 0) {
        Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> pair = (Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>) os[0];
        boolean spilled = false;
        for (MapJoinTableContainer container : pair.getLeft()) {
            if (container != null) {
                spilled = spilled || container.hasSpill();
            }
        }
        if (spilled) {
            // we can't use the cached table because it has spilled.
            loadHashTable(getExecContext(), MapredContext.get());
        } else {
            if (LOG.isDebugEnabled()) {
                String s = "Using tables from cache: [";
                for (MapJoinTableContainer c : pair.getLeft()) {
                    s += ((c == null) ? "null" : c.getClass().getSimpleName()) + ", ";
                }
                LOG.debug(s + "]");
            }
            // let's use the table from the cache.
            mapJoinTables = pair.getLeft();
            mapJoinTableSerdes = pair.getRight();
        }
        hashTblInitedOnce = true;
    }
    if (this.getExecContext() != null) {
        // reset exec context so that initialization of the map operator happens
        // properly
        this.getExecContext().setLastInputPath(null);
        this.getExecContext().setCurrentInputPath(null);
    }
}
Also used : MapJoinTableContainerSerDe(org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe) MapJoinTableContainer(org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer) Pair(org.apache.commons.lang3.tuple.Pair) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair) ObjectPair(org.apache.hadoop.hive.common.ObjectPair)
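
The decision above — reuse the cached hash tables only when no container has spilled — is easy to isolate. Below is a minimal, self-contained sketch of that check; the SpillAware interface is a hypothetical stand-in for the hasSpill() contract of MapJoinTableContainer, not a Hive type.

// Hypothetical stand-in for the hasSpill() contract of MapJoinTableContainer.
interface SpillAware {
    boolean hasSpill();
}

final class CacheReuseCheck {

    // Safe to reuse the cached tables only if no non-null container has spilled,
    // mirroring the loop in completeInitializationOp.
    static boolean canReuse(SpillAware[] containers) {
        for (SpillAware c : containers) {
            if (c != null && c.hasSpill()) {
                return false;
            }
        }
        return true;
    }

    public static void main(String[] args) {
        SpillAware clean = () -> false;
        SpillAware spilled = () -> true;
        System.out.println(canReuse(new SpillAware[] { clean, null }));    // true: reuse cache
        System.out.println(canReuse(new SpillAware[] { clean, spilled })); // false: reload tables
    }
}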

Example 2 with MapJoinTableContainer

Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer in project hive by apache.

The class MapJoinOperator, method canSkipJoinProcessing:

// If a loaded hash table is empty, then under certain conditions we can skip processing the big table rows.
protected boolean canSkipJoinProcessing(ExecMapperContext mapContext) {
    if (!canSkipReload(mapContext)) {
        return false;
    }
    JoinCondDesc[] joinConds = getConf().getConds();
    if (joinConds.length > 0) {
        for (JoinCondDesc joinCond : joinConds) {
            if (joinCond.getType() != JoinDesc.INNER_JOIN) {
                return false;
            }
        }
    } else {
        return false;
    }
    boolean skipJoinProcessing = false;
    for (int idx = 0; idx < mapJoinTables.length; ++idx) {
        if (idx == getConf().getPosBigTable()) {
            continue;
        }
        MapJoinTableContainer mapJoinTable = mapJoinTables[idx];
        if (mapJoinTable.size() == 0) {
            // If any table is empty, an inner join involving the tables should yield 0 rows.
            LOG.info("Hash table number " + idx + " is empty");
            skipJoinProcessing = true;
            break;
        }
    }
    return skipJoinProcessing;
}
Also used : MapJoinTableContainer(org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer) JoinCondDesc(org.apache.hadoop.hive.ql.plan.JoinCondDesc)
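
The short-circuit rests on a relational fact: an inner join with an empty build side produces zero rows, so the probe side never needs to be scanned. A minimal sketch of that test, assuming a hypothetical Table interface standing in for MapJoinTableContainer.size():

// Hypothetical stand-in for the size() contract of MapJoinTableContainer.
interface Table {
    long size();
}

final class InnerJoinSkipCheck {

    // For inner joins only: if any small (build-side) table is empty, the join
    // yields zero rows, so processing the big (probe-side) table can be skipped.
    static boolean canSkip(Table[] tables, int bigTablePos) {
        for (int i = 0; i < tables.length; i++) {
            if (i == bigTablePos) {
                continue; // the big table is streamed, never loaded into a hash table
            }
            if (tables[i].size() == 0) {
                return true;
            }
        }
        return false;
    }

    public static void main(String[] args) {
        Table empty = () -> 0L;
        Table loaded = () -> 42L;
        System.out.println(canSkip(new Table[] { loaded, empty }, 0));  // true
        System.out.println(canSkip(new Table[] { loaded, loaded }, 0)); // false
    }
}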

Example 3 with MapJoinTableContainer

Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer in project hive by apache.

The class MapJoinOperator, method initializeOp:

@Override
protected void initializeOp(Configuration hconf) throws HiveException {
    this.hconf = hconf;
    unwrapContainer = new UnwrapRowContainer[conf.getTagLength()];
    super.initializeOp(hconf);
    int tagLen = conf.getTagLength();
    // On Tez only: The hash map might already be cached in the container we run
    // the task in. On MR: The cache is a no-op.
    String queryId = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVEQUERYID);
    cacheKey = "HASH_MAP_" + this.getOperatorId() + "_container";
    cache = ObjectCacheFactory.getCache(hconf, queryId, false);
    loader = getHashTableLoader(hconf);
    hashMapRowGetters = null;
    mapJoinTables = new MapJoinTableContainer[tagLen];
    mapJoinTableSerdes = new MapJoinTableContainerSerDe[tagLen];
    hashTblInitedOnce = false;
    // Reset grace hashjoin context so that there is no state maintained when operator/work is
    // retrieved from object cache
    hybridMapJoinLeftover = false;
    firstSmallTable = null;
    generateMapMetaData();
    final ExecMapperContext mapContext = getExecContext();
    final MapredContext mrContext = MapredContext.get();
    if (!conf.isBucketMapJoin() && !conf.isDynamicPartitionHashJoin()) {
        /*
       * The issue with caching in case of bucket map join is that different tasks
       * process different buckets and if the container is reused to join a different bucket,
       * join results can be incorrect. The cache is keyed on operator id and for bucket map join
       * the operator does not change but data needed is different. For a proper fix, this
       * requires changes in the Tez API with regard to finding bucket id and
       * also ability to schedule tasks to re-use containers that have cached the specific bucket.
       */
        if (isLogDebugEnabled) {
            LOG.debug("This is not bucket map join, so cache");
        }
        Future<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>> future = cache.retrieveAsync(cacheKey, new Callable<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>>() {

            @Override
            public Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> call() throws HiveException {
                return loadHashTable(mapContext, mrContext);
            }
        });
        asyncInitOperations.add(future);
    } else if (!isInputFileChangeSensitive(mapContext)) {
        loadHashTable(mapContext, mrContext);
        hashTblInitedOnce = true;
    }
}
Also used : ExecMapperContext(org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext) MapJoinTableContainerSerDe(org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) MapJoinTableContainer(org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer) Pair(org.apache.commons.lang3.tuple.Pair) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair) ObjectPair(org.apache.hadoop.hive.common.ObjectPair)
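
The caching idiom in initializeOp — submit the expensive hash-table load as a Callable, publish the Future under a key, and let later tasks in the same container block on the same Future — can be reproduced with plain java.util.concurrent types. Below is a toy sketch, not Hive's ObjectCache implementation; the cache key merely imitates the "HASH_MAP_<operatorId>_container" pattern above.

import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

// Toy per-process object cache illustrating the retrieveAsync pattern; not Hive API.
final class ToyObjectCache {

    private final ConcurrentHashMap<String, Future<?>> cache = new ConcurrentHashMap<>();
    private final ExecutorService pool = Executors.newCachedThreadPool();

    // The first caller triggers the load; concurrent callers get the same Future,
    // so the value is computed at most once per key.
    @SuppressWarnings("unchecked")
    <T> Future<T> retrieveAsync(String key, Callable<T> loader) {
        return (Future<T>) cache.computeIfAbsent(key, k -> pool.submit(loader));
    }

    public static void main(String[] args) throws Exception {
        ToyObjectCache cache = new ToyObjectCache();
        Callable<String> expensiveLoad = () -> "loaded-hash-table";
        Future<String> f1 = cache.retrieveAsync("HASH_MAP_MAPJOIN_7_container", expensiveLoad);
        Future<String> f2 = cache.retrieveAsync("HASH_MAP_MAPJOIN_7_container", expensiveLoad);
        // Prints: loaded-hash-table, future reused: true
        System.out.println(f1.get() + ", future reused: " + (f1 == f2));
        cache.pool.shutdown();
    }
}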

Example 4 with MapJoinTableContainer

Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer in project hive by apache.

The class MapJoinOperator, method closeOp:

@Override
public void closeOp(boolean abort) throws HiveException {
    boolean spilled = false;
    for (MapJoinTableContainer container : mapJoinTables) {
        if (container != null) {
            spilled = spilled || container.hasSpill();
            container.dumpMetrics();
        }
    }
    // For Hybrid Grace Hash Join, we need to see if there is any spilled data to be processed next
    if (spilled) {
        if (!abort) {
            if (hashMapRowGetters == null) {
                hashMapRowGetters = new ReusableGetAdaptor[mapJoinTables.length];
            }
            int numPartitions = 0;
            // Find out number of partitions for each small table (should be same across tables)
            for (byte pos = 0; pos < mapJoinTables.length; pos++) {
                if (pos != conf.getPosBigTable()) {
                    firstSmallTable = (HybridHashTableContainer) mapJoinTables[pos];
                    numPartitions = firstSmallTable.getHashPartitions().length;
                    break;
                }
            }
            assert numPartitions != 0 : "Number of partitions must be greater than 0!";
            if (firstSmallTable.hasSpill()) {
                spilledMapJoinTables = new MapJoinBytesTableContainer[mapJoinTables.length];
                hybridMapJoinLeftover = true;
                // Clear all in-memory partitions first
                for (byte pos = 0; pos < mapJoinTables.length; pos++) {
                    MapJoinTableContainer tableContainer = mapJoinTables[pos];
                    if (tableContainer != null && tableContainer instanceof HybridHashTableContainer) {
                        HybridHashTableContainer hybridHtContainer = (HybridHashTableContainer) tableContainer;
                        hybridHtContainer.dumpStats();
                        HashPartition[] hashPartitions = hybridHtContainer.getHashPartitions();
                        // Only partitions still in memory need clearing; on-disk ones are reprocessed below
                        for (int i = 0; i < hashPartitions.length; i++) {
                            if (!hashPartitions[i].isHashMapOnDisk()) {
                                hybridHtContainer.setTotalInMemRowCount(hybridHtContainer.getTotalInMemRowCount() - hashPartitions[i].getHashMapFromMemory().getNumValues());
                                hashPartitions[i].getHashMapFromMemory().clear();
                            }
                        }
                        assert hybridHtContainer.getTotalInMemRowCount() == 0;
                    }
                }
                // Reprocess the spilled data
                for (int i = 0; i < numPartitions; i++) {
                    HashPartition[] hashPartitions = firstSmallTable.getHashPartitions();
                    if (hashPartitions[i].isHashMapOnDisk()) {
                        try {
                            // Re-process spilled data
                            continueProcess(i);
                        } catch (KryoException ke) {
                            LOG.error("Processing the spilled data failed due to Kryo error!");
                            LOG.error("Cleaning up all spilled data!");
                            cleanupGraceHashJoin();
                            throw new HiveException(ke);
                        } catch (Exception e) {
                            throw new HiveException(e);
                        }
                        for (byte pos = 0; pos < order.length; pos++) {
                            if (pos != conf.getPosBigTable())
                                spilledMapJoinTables[pos] = null;
                        }
                    }
                }
            }
        }
        if (isLogInfoEnabled) {
            LOG.info("spilled: " + spilled + " abort: " + abort + ". Clearing spilled partitions.");
        }
        // spilled tables are loaded always (no sharing), so clear it
        clearAllTableContainers();
        cache.remove(cacheKey);
    }
    // in mapreduce case, we need to always clear up as mapreduce doesn't have object registry.
    if ((this.getExecContext() != null) && (this.getExecContext().getLocalWork() != null) && (this.getExecContext().getLocalWork().getInputFileChangeSensitive()) && !(HiveConf.getVar(hconf, ConfVars.HIVE_EXECUTION_ENGINE).equals("spark") && SparkUtilities.isDedicatedCluster(hconf))) {
        if (isLogInfoEnabled) {
            LOG.info("MR: Clearing all map join table containers.");
        }
        clearAllTableContainers();
    }
    this.loader = null;
    super.closeOp(abort);
}
Also used : KryoException(com.esotericsoftware.kryo.KryoException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) HashPartition(org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition) MapJoinTableContainer(org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer) HybridHashTableContainer(org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
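
The cleanup in closeOp follows one rule per partition: a partition that stayed in memory was fully probed on the first pass and can be dropped, while a partition that spilled still holds unjoined rows and must be reloaded and re-probed. A minimal sketch of that triage; Partition is a hypothetical stand-in for HybridHashTableContainer.HashPartition, not Hive API.

// Hypothetical stand-in for HybridHashTableContainer.HashPartition.
final class Partition {
    final boolean hashMapOnDisk;
    Partition(boolean hashMapOnDisk) { this.hashMapOnDisk = hashMapOnDisk; }
}

final class SpillTriageSketch {

    // In-memory partitions were already joined during the first probe pass, so they
    // are cleared; on-disk partitions are replayed one at a time, as closeOp does
    // via continueProcess(i).
    static void triage(Partition[] partitions) {
        for (int i = 0; i < partitions.length; i++) {
            if (partitions[i].hashMapOnDisk) {
                System.out.println("partition " + i + ": reload from disk and re-probe");
            } else {
                System.out.println("partition " + i + ": clear in-memory hash map");
            }
        }
    }

    public static void main(String[] args) {
        triage(new Partition[] { new Partition(false), new Partition(true), new Partition(false) });
    }
}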

Example 5 with MapJoinTableContainer

Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer in project hive by apache.

The class HashTableLoader, method loadDirectly:

private void loadDirectly(MapJoinTableContainer[] mapJoinTables, String inputFileName) throws Exception {
    MapredLocalWork localWork = context.getLocalWork();
    List<Operator<?>> directWorks = localWork.getDirectFetchOp().get(joinOp);
    if (directWorks == null || directWorks.isEmpty()) {
        return;
    }
    JobConf job = new JobConf(hconf);
    MapredLocalTask localTask = new MapredLocalTask(localWork, job, false);
    HashTableSinkOperator sink = new TemporaryHashSinkOperator(new CompilationOpContext(), desc);
    sink.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>(directWorks));
    for (Operator<?> operator : directWorks) {
        if (operator != null) {
            operator.setChildOperators(Arrays.<Operator<? extends OperatorDesc>>asList(sink));
        }
    }
    localTask.setExecContext(context);
    localTask.startForward(inputFileName);
    MapJoinTableContainer[] tables = sink.getMapJoinTables();
    for (int i = 0; i < sink.getNumParent(); i++) {
        if (sink.getParentOperators().get(i) != null) {
            mapJoinTables[i] = tables[i];
        }
    }
    Arrays.fill(tables, null);
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) HashTableSinkOperator(org.apache.hadoop.hive.ql.exec.HashTableSinkOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) TemporaryHashSinkOperator(org.apache.hadoop.hive.ql.exec.TemporaryHashSinkOperator) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) MapJoinTableContainer(org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer) JobConf(org.apache.hadoop.mapred.JobConf) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
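
loadDirectly splices a temporary sink beneath the direct-fetch operators by rewiring parent/child links by hand before forwarding rows. The same wiring pattern, reduced to a toy operator graph; the Op class below is a hypothetical stand-in for org.apache.hadoop.hive.ql.exec.Operator, not the real class.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Toy operator node; a hypothetical stand-in for Hive's Operator class.
final class Op {
    final String name;
    List<Op> parents = new ArrayList<>();
    List<Op> children = new ArrayList<>();
    Op(String name) { this.name = name; }
}

final class SinkWiringSketch {

    // Mirrors the wiring in loadDirectly: the sink adopts the direct works as its
    // parents, and each direct work gets the sink as its sole child.
    static void attachSink(List<Op> directWorks, Op sink) {
        sink.parents = new ArrayList<>(directWorks);
        for (Op op : directWorks) {
            if (op != null) {
                op.children = Arrays.asList(sink);
            }
        }
    }

    public static void main(String[] args) {
        Op scanA = new Op("scanA");
        Op scanB = new Op("scanB");
        Op sink = new Op("hashTableSink");
        attachSink(Arrays.asList(scanA, scanB), sink);
        // Prints: hashTableSink has 2 parents
        System.out.println(scanA.children.get(0).name + " has " + sink.parents.size() + " parents");
    }
}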

Aggregations

MapJoinTableContainer (org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer): 9 usages
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 3 usages
IOException (java.io.IOException): 2 usages
ImmutablePair (org.apache.commons.lang3.tuple.ImmutablePair): 2 usages
Pair (org.apache.commons.lang3.tuple.Pair): 2 usages
ObjectPair (org.apache.hadoop.hive.common.ObjectPair): 2 usages
CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext): 2 usages
HashTableSinkOperator (org.apache.hadoop.hive.ql.exec.HashTableSinkOperator): 2 usages
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 2 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 2 usages
TemporaryHashSinkOperator (org.apache.hadoop.hive.ql.exec.TemporaryHashSinkOperator): 2 usages
HybridHashTableContainer (org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer): 2 usages
MapJoinTableContainerSerDe (org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe): 2 usages
MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork): 2 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 2 usages
JobConf (org.apache.hadoop.mapred.JobConf): 2 usages
KryoException (com.esotericsoftware.kryo.KryoException): 1 usage
ExecMapperContext (org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext): 1 usage
MapredLocalTask (org.apache.hadoop.hive.ql.exec.mr.MapredLocalTask): 1 usage
HashMapWrapper (org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper): 1 usage