Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe in project hive by apache.
The class MapJoinOperator, method completeInitializationOp.
@SuppressWarnings("unchecked")
@Override
protected void completeInitializationOp(Object[] os) throws HiveException {
  if (os.length != 0) {
    Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> pair =
        (Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>) os[0];
    boolean spilled = false;
    for (MapJoinTableContainer container : pair.getLeft()) {
      if (container != null) {
        spilled = spilled || container.hasSpill();
      }
    }
    if (spilled) {
      // we can't use the cached table because it has spilled.
      loadHashTable(getExecContext(), MapredContext.get());
    } else {
      if (LOG.isDebugEnabled()) {
        String s = "Using tables from cache: [";
        for (MapJoinTableContainer c : pair.getLeft()) {
          s += ((c == null) ? "null" : c.getClass().getSimpleName()) + ", ";
        }
        LOG.debug(s + "]");
      }
      // let's use the table from the cache.
      mapJoinTables = pair.getLeft();
      mapJoinTableSerdes = pair.getRight();
    }
    hashTblInitedOnce = true;
  }
  if (this.getExecContext() != null) {
    // reset exec context so that initialization of the map operator happens
    // properly
    this.getExecContext().setLastInputPath(null);
    this.getExecContext().setCurrentInputPath(null);
  }
}
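completeInitializationOp picks up the hash tables and their serdes that an earlier task left in the object cache, and falls back to loadHashTable if any cached container has spilled to disk. Below is a minimal sketch of that contract, assuming the Pair type is org.apache.commons.lang3.tuple.Pair; the helper class and method names are illustrative only.

import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;

public class CachedTablesSketch {

  // Bundle the loaded tables with the serdes that can rehydrate them, so a later
  // task in the same container can reuse both from the cache.
  static Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> bundle(
      MapJoinTableContainer[] tables, MapJoinTableContainerSerDe[] serdes) {
    return Pair.of(tables, serdes);
  }

  // The reuse test mirrors completeInitializationOp: a single spilled container
  // invalidates the whole cached pair.
  static boolean reusable(Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> pair) {
    for (MapJoinTableContainer container : pair.getLeft()) {
      if (container != null && container.hasSpill()) {
        return false;
      }
    }
    return true;
  }
}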
Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe in project hive by apache.
The class SparkHashTableSinkOperator, method flushToFile.
protected void flushToFile(MapJoinPersistableTableContainer tableContainer, byte tag) throws Exception {
  MapredLocalWork localWork = getExecContext().getLocalWork();
  BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
  Path inputPath = getExecContext().getCurrentInputPath();
  String bigInputPath = null;
  if (inputPath != null && mapJoinCtx != null) {
    Set<String> aliases =
        ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get((int) tag);
    bigInputPath = mapJoinCtx.getMappingBigFile(aliases.iterator().next(), inputPath.toString());
  }
  // get tmp file URI
  Path tmpURI = localWork.getTmpHDFSPath();
  LOG.info("Temp URI for side table: " + tmpURI);
  // get current bucket file name
  String fileName = localWork.getBucketFileName(bigInputPath);
  // get the tmp URI path; it will be a hdfs path if not local mode
  String dumpFilePrefix = conf.getDumpFilePrefix();
  Path path = Utilities.generatePath(tmpURI, dumpFilePrefix, tag, fileName);
  FileSystem fs = path.getFileSystem(htsOperator.getConfiguration());
  // Create the folder and its parents if not there
  fs.mkdirs(path);
  while (true) {
    path = new Path(path, getOperatorId() + "-" + Math.abs(Utilities.randGen.nextInt()));
    try {
      // This will guarantee file name uniqueness.
      if (fs.createNewFile(path)) {
        break;
      }
    } catch (FileExistsException e) {
      // No problem, use a new name
    }
  }
  htsOperator.console.printInfo(Utilities.now() + "\tDump the side-table for tag: " + tag
      + " with group count: " + tableContainer.size() + " into file: " + path);
  try {
    // get the hashtable file and path
    OutputStream os = null;
    ObjectOutputStream out = null;
    MapJoinTableContainerSerDe mapJoinTableSerde = htsOperator.mapJoinTableSerdes[tag];
    try {
      os = fs.create(path, numReplication);
      out = new ObjectOutputStream(new BufferedOutputStream(os, 4096));
      mapJoinTableSerde.persist(out, tableContainer);
    } finally {
      if (out != null) {
        out.close();
      } else if (os != null) {
        os.close();
      }
    }
    FileStatus status = fs.getFileStatus(path);
    htsOperator.console.printInfo(Utilities.now() + "\tUploaded 1 File to: " + path
        + " (" + status.getLen() + " bytes)");
  } catch (Exception e) {
    // Failed to dump the side-table, remove the partial file
    try {
      fs.delete(path, false);
    } catch (Exception ex) {
      LOG.warn("Got exception in deleting partial side-table dump for tag: " + tag
          + ", file " + path, ex);
    }
    throw e;
  }
  tableContainer.clear();
}
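The core of flushToFile is the single persist call: the MapJoinTableContainerSerDe writes the small-table container through an ObjectOutputStream onto the chosen filesystem path. Below is a minimal sketch of just that step, assuming the container and serde were built elsewhere; the helper name and buffer size are illustrative, not part of the Hive API.

import java.io.BufferedOutputStream;
import java.io.ObjectOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinPersistableTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;

public class PersistSketch {
  // Write the small-table container to an HDFS (or local) path so the map-join
  // side can load it later; try-with-resources replaces the manual close logic.
  static void dump(FileSystem fs, Path path, MapJoinTableContainerSerDe serde,
      MapJoinPersistableTableContainer tableContainer) throws Exception {
    try (ObjectOutputStream out =
        new ObjectOutputStream(new BufferedOutputStream(fs.create(path), 4096))) {
      serde.persist(out, tableContainer);
    }
  }
}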
Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe in project hive by apache.
The class MapJoinOperator, method generateMapMetaData.
public void generateMapMetaData() throws HiveException {
  try {
    TableDesc keyTableDesc = conf.getKeyTblDesc();
    AbstractSerDe keySerializer =
        (AbstractSerDe) ReflectionUtil.newInstance(keyTableDesc.getDeserializerClass(), null);
    SerDeUtils.initializeSerDe(keySerializer, null, keyTableDesc.getProperties(), null);
    MapJoinObjectSerDeContext keyContext = new MapJoinObjectSerDeContext(keySerializer, false);
    for (int pos = 0; pos < order.length; pos++) {
      if (pos == posBigTable) {
        continue;
      }
      TableDesc valueTableDesc;
      if (conf.getNoOuterJoin()) {
        valueTableDesc = conf.getValueTblDescs().get(pos);
      } else {
        valueTableDesc = conf.getValueFilteredTblDescs().get(pos);
      }
      AbstractSerDe valueSerDe =
          (AbstractSerDe) ReflectionUtil.newInstance(valueTableDesc.getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(valueSerDe, null, valueTableDesc.getProperties(), null);
      MapJoinObjectSerDeContext valueContext =
          new MapJoinObjectSerDeContext(valueSerDe, hasFilter(pos));
      mapJoinTableSerdes[pos] = new MapJoinTableContainerSerDe(keyContext, valueContext);
    }
  } catch (SerDeException e) {
    throw new HiveException(e);
  }
}
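generateMapMetaData builds one MapJoinObjectSerDeContext for the join keys and one per small-table value side, then pairs them into a MapJoinTableContainerSerDe per position. Below is a standalone sketch of that wiring, assuming LazyBinarySerDe and hypothetical one-column schemas; in the operator the serde class and properties come from the key and value TableDescs.

import java.util.Properties;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe;

public class SerDeWiringSketch {
  static MapJoinTableContainerSerDe build() throws Exception {
    // Key side: no filter tag is ever appended to join keys.
    AbstractSerDe keySerde = new LazyBinarySerDe();
    Properties keyProps = new Properties();
    keyProps.setProperty(serdeConstants.LIST_COLUMNS, "joinkey");      // hypothetical schema
    keyProps.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int");
    SerDeUtils.initializeSerDe(keySerde, null, keyProps, null);

    // Value side: the boolean flag marks whether rows carry a trailing filter tag.
    AbstractSerDe valueSerde = new LazyBinarySerDe();
    Properties valueProps = new Properties();
    valueProps.setProperty(serdeConstants.LIST_COLUMNS, "joinvalue");  // hypothetical schema
    valueProps.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string");
    SerDeUtils.initializeSerDe(valueSerde, null, valueProps, null);

    return new MapJoinTableContainerSerDe(
        new MapJoinObjectSerDeContext(keySerde, false),
        new MapJoinObjectSerDeContext(valueSerde, false));
  }
}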
Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe in project hive by apache.
The class HashTableSinkOperator, method initializeOp.
@Override
@SuppressWarnings("unchecked")
protected void initializeOp(Configuration hconf) throws HiveException {
  super.initializeOp(hconf);
  boolean isSilent = HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVESESSIONSILENT);
  console = new LogHelper(LOG, isSilent);
  memoryExhaustionHandler = new MapJoinMemoryExhaustionHandler(console, conf.getHashtableMemoryUsage());
  emptyRowContainer.addRow(emptyObjectArray);
  // for small tables only; so get the big table position first
  posBigTableAlias = conf.getPosBigTable();
  order = conf.getTagOrder();
  // initialize some variables, which used to be initialized in CommonJoinOperator
  this.hconf = hconf;
  filterMaps = conf.getFilterMap();
  int tagLen = conf.getTagLength();
  // process join keys
  joinKeys = new List[tagLen];
  JoinUtil.populateJoinKeyValue(joinKeys, conf.getKeys(), posBigTableAlias, hconf);
  joinKeysObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinKeys,
      inputObjInspectors, posBigTableAlias, tagLen);
  // process join values
  joinValues = new List[tagLen];
  JoinUtil.populateJoinKeyValue(joinValues, conf.getExprs(), posBigTableAlias, hconf);
  joinValuesObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinValues,
      inputObjInspectors, posBigTableAlias, tagLen);
  // process join filters
  joinFilters = new List[tagLen];
  JoinUtil.populateJoinKeyValue(joinFilters, conf.getFilters(), posBigTableAlias, hconf);
  joinFilterObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinFilters,
      inputObjInspectors, posBigTableAlias, tagLen);
  if (!conf.isNoOuterJoin()) {
    for (Byte alias : order) {
      if (alias == posBigTableAlias || joinValues[alias] == null) {
        continue;
      }
      List<ObjectInspector> rcOIs = joinValuesObjectInspectors[alias];
      if (filterMaps != null && filterMaps[alias] != null) {
        // for each alias, add object inspector for filter tag as the last element
        rcOIs = new ArrayList<ObjectInspector>(rcOIs);
        rcOIs.add(PrimitiveObjectInspectorFactory.writableShortObjectInspector);
      }
    }
  }
  mapJoinTables = new MapJoinPersistableTableContainer[tagLen];
  mapJoinTableSerdes = new MapJoinTableContainerSerDe[tagLen];
  hashTableScale = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVEHASHTABLESCALE);
  if (hashTableScale <= 0) {
    hashTableScale = 1;
  }
  try {
    TableDesc keyTableDesc = conf.getKeyTblDesc();
    AbstractSerDe keySerde =
        (AbstractSerDe) ReflectionUtils.newInstance(keyTableDesc.getDeserializerClass(), null);
    SerDeUtils.initializeSerDe(keySerde, null, keyTableDesc.getProperties(), null);
    MapJoinObjectSerDeContext keyContext = new MapJoinObjectSerDeContext(keySerde, false);
    for (Byte pos : order) {
      if (pos == posBigTableAlias) {
        continue;
      }
      mapJoinTables[pos] = new HashMapWrapper(hconf, -1);
      TableDesc valueTableDesc = conf.getValueTblFilteredDescs().get(pos);
      AbstractSerDe valueSerDe =
          (AbstractSerDe) ReflectionUtils.newInstance(valueTableDesc.getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(valueSerDe, null, valueTableDesc.getProperties(), null);
      mapJoinTableSerdes[pos] = new MapJoinTableContainerSerDe(keyContext,
          new MapJoinObjectSerDeContext(valueSerDe, hasFilter(pos)));
    }
  } catch (SerDeException e) {
    throw new HiveException(e);
  }
}
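HashTableSinkOperator keeps two arrays indexed by join tag: a HashMapWrapper container and a MapJoinTableContainerSerDe for each small table, with the big-table position deliberately left null. Below is a minimal sketch of that per-tag layout, assuming the key and value contexts are already initialized; the class and parameter names are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinPersistableTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;

public class PerTagLayoutSketch {
  final MapJoinPersistableTableContainer[] tables;
  final MapJoinTableContainerSerDe[] serdes;

  PerTagLayoutSketch(Configuration hconf, int tagLen, int posBigTable,
      MapJoinObjectSerDeContext keyContext, MapJoinObjectSerDeContext[] valueContexts) {
    tables = new MapJoinPersistableTableContainer[tagLen];
    serdes = new MapJoinTableContainerSerDe[tagLen];
    for (int pos = 0; pos < tagLen; pos++) {
      if (pos == posBigTable) {
        continue; // the big table is streamed, never materialized as a hash table
      }
      tables[pos] = new HashMapWrapper(hconf, -1);  // in-memory small-table container
      serdes[pos] = new MapJoinTableContainerSerDe(keyContext, valueContexts[pos]);
    }
  }
}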
Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe in project hive by apache.
The class MapJoinOperator, method initializeOp.
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
  this.hconf = hconf;
  unwrapContainer = new UnwrapRowContainer[conf.getTagLength()];
  super.initializeOp(hconf);
  int tagLen = conf.getTagLength();
  // On Tez only: The hash map might already be cached in the container we run
  // the task in. On MR: The cache is a no-op.
  String queryId = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVEQUERYID);
  cacheKey = "HASH_MAP_" + this.getOperatorId() + "_container";
  cache = ObjectCacheFactory.getCache(hconf, queryId, false);
  loader = getHashTableLoader(hconf);
  bucketId = hconf.getInt(Constants.LLAP_BUCKET_ID, -1);
  numBuckets = hconf.getInt(Constants.LLAP_NUM_BUCKETS, -1);
  hashMapRowGetters = null;
  mapJoinTables = new MapJoinTableContainer[tagLen];
  mapJoinTableSerdes = new MapJoinTableContainerSerDe[tagLen];
  hashTblInitedOnce = false;
  // Reset grace hashjoin context so that there is no state maintained when operator/work is
  // retrieved from object cache
  hybridMapJoinLeftover = false;
  firstSmallTable = null;
  generateMapMetaData();
  isTestingNoHashTableLoad =
      HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVE_MAPJOIN_TESTING_NO_HASH_TABLE_LOAD);
  if (isTestingNoHashTableLoad) {
    return;
  }
  final ExecMapperContext mapContext = getExecContext();
  final MapredContext mrContext = MapredContext.get();
  if (!conf.isBucketMapJoin() && !conf.isDynamicPartitionHashJoin()) {
    /*
     * The issue with caching in case of bucket map join is that different tasks
     * process different buckets and if the container is reused to join a different bucket,
     * join results can be incorrect. The cache is keyed on operator id and for bucket map join
     * the operator does not change but data needed is different. For a proper fix, this
     * requires changes in the Tez API with regard to finding bucket id and
     * also ability to schedule tasks to re-use containers that have cached the specific bucket.
     */
    if (LOG.isDebugEnabled()) {
      LOG.debug("This is not bucket map join, so cache");
    }
    Future<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>> future =
        cache.retrieveAsync(cacheKey, () -> loadHashTable(mapContext, mrContext));
    asyncInitOperations.add(future);
  } else if (!isInputFileChangeSensitive(mapContext)) {
    loadHashTable(mapContext, mrContext);
    hashTblInitedOnce = true;
  }
}
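On Tez, the operator does not load the hash tables directly; it asks the ObjectCache for them asynchronously and only computes them on a cache miss. Below is a minimal sketch of that pattern using the ObjectCacheFactory and retrieveAsync calls shown above; loadTables is a hypothetical stand-in for loadHashTable(mapContext, mrContext), and the cache-key format mirrors the snippet.

import java.util.concurrent.Future;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.ObjectCache;
import org.apache.hadoop.hive.ql.exec.ObjectCacheFactory;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;

public class AsyncCacheSketch {
  static Future<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>> schedule(
      Configuration hconf, String queryId, String operatorId) throws Exception {
    // Key the cache on the operator id so a reused Tez container can find the tables.
    String cacheKey = "HASH_MAP_" + operatorId + "_container";
    ObjectCache cache = ObjectCacheFactory.getCache(hconf, queryId, false);
    // The loader callable runs only on a cache miss; on a hit the cached pair is returned.
    return cache.retrieveAsync(cacheKey, AsyncCacheSketch::loadTables);
  }

  // Hypothetical stand-in for MapJoinOperator.loadHashTable(mapContext, mrContext).
  static Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> loadTables() {
    return Pair.of(new MapJoinTableContainer[0], new MapJoinTableContainerSerDe[0]);
  }
}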