
Example 1 with BucketMapJoinContext

Use of org.apache.hadoop.hive.ql.plan.BucketMapJoinContext in project hive by apache.

From the class SMBMapJoinOperator, method setUpFetchContexts:

private void setUpFetchContexts(String alias, MergeQueue mergeQueue) throws HiveException {
    mergeQueue.clearFetchContext();
    Path currentInputPath = getExecContext().getCurrentInputPath();
    BucketMapJoinContext bucketMatcherCxt = localWork.getBucketMapjoinContext();
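    // The plan configures which BucketMatcher implementation to use (Example 5 below sets
    // DefaultBucketMatcher); it is read from the context here and instantiated reflectively.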
    Class<? extends BucketMatcher> bucketMatcherCls = bucketMatcherCxt.getBucketMatcherClass();
    BucketMatcher bucketMatcher = ReflectionUtil.newInstance(bucketMatcherCls, null);
    getExecContext().setFileId(bucketMatcherCxt.createFileId(currentInputPath.toString()));
    LOG.info("set task id: " + getExecContext().getFileId());
    bucketMatcher.setAliasBucketFileNameMapping(bucketMatcherCxt.getAliasBucketFileNameMapping());
    List<Path> aliasFiles = bucketMatcher.getAliasBucketFiles(currentInputPath.toString(), bucketMatcherCxt.getMapJoinBigTableAlias(), alias);
    mergeQueue.setupContext(aliasFiles);
}
Also used: Path (org.apache.hadoop.fs.Path), BucketMapJoinContext (org.apache.hadoop.hive.ql.plan.BucketMapJoinContext)
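
The nested mapping consumed by setAliasBucketFileNameMapping can be inferred from this page: Example 5 declares it as Map<String, Map<String, List<String>>>, and Example 4 looks it up as small-table alias -> big-table bucket file -> small-table bucket files. A minimal, hypothetical sketch of building that structure with plain java.util collections (the aliases and HDFS paths are invented for illustration):

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class AliasBucketMappingSketch {

    public static void main(String[] args) {
        // Hypothetical layout: big-table alias "a", small-table alias "b".
        // For each big-table bucket file, list the small-table bucket files it joins with.
        Map<String, List<String>> bigFileToSmallFiles = new HashMap<>();
        bigFileToSmallFiles.put("hdfs:///warehouse/a/000000_0",
                Arrays.asList("hdfs:///warehouse/b/000000_0"));
        bigFileToSmallFiles.put("hdfs:///warehouse/a/000001_0",
                Arrays.asList("hdfs:///warehouse/b/000001_0"));

        // Keyed by the small-table alias, matching the lookup in Example 4:
        // getAliasBucketFileNameMapping().get(alias).get(bigInputPath)
        Map<String, Map<String, List<String>>> aliasBucketFileNameMapping = new HashMap<>();
        aliasBucketFileNameMapping.put("b", bigFileToSmallFiles);

        // This is the structure handed to bucketMatcher.setAliasBucketFileNameMapping(...)
        // before getAliasBucketFiles(currentInputPath, bigTableAlias, alias) is called.
        System.out.println(aliasBucketFileNameMapping);
    }
}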

Example 2 with BucketMapJoinContext

Use of org.apache.hadoop.hive.ql.plan.BucketMapJoinContext in project hive by apache.

From the class SparkHashTableSinkOperator, method flushToFile:

protected void flushToFile(MapJoinPersistableTableContainer tableContainer, byte tag) throws Exception {
    MapredLocalWork localWork = getExecContext().getLocalWork();
    BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
    Path inputPath = getExecContext().getCurrentInputPath();
    String bigInputPath = null;
    if (inputPath != null && mapJoinCtx != null) {
        Set<String> aliases = ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get((int) tag);
        bigInputPath = mapJoinCtx.getMappingBigFile(aliases.iterator().next(), inputPath.toString());
    }
    // get tmp file URI
    Path tmpURI = localWork.getTmpHDFSPath();
    LOG.info("Temp URI for side table: " + tmpURI);
    // get current bucket file name
    String fileName = localWork.getBucketFileName(bigInputPath);
    // get the tmp URI path; it will be a hdfs path if not local mode
    String dumpFilePrefix = conf.getDumpFilePrefix();
    Path path = Utilities.generatePath(tmpURI, dumpFilePrefix, tag, fileName);
    FileSystem fs = path.getFileSystem(htsOperator.getConfiguration());
    // Create the folder and its parents if not there
    fs.mkdirs(path);
    while (true) {
        path = new Path(path, getOperatorId() + "-" + Math.abs(ThreadLocalRandom.current().nextInt()));
        try {
            // This will guarantee file name uniqueness.
            if (fs.createNewFile(path)) {
                break;
            }
        } catch (FileExistsException e) {
        // No problem, use a new name
        }
    }
    htsOperator.console.printInfo(Utilities.now() + "\tDump the side-table for tag: " + tag + " with group count: " + tableContainer.size() + " into file: " + path);
    try {
        // get the hashtable file and path
        OutputStream os = null;
        ObjectOutputStream out = null;
        MapJoinTableContainerSerDe mapJoinTableSerde = htsOperator.mapJoinTableSerdes[tag];
        try {
            os = fs.create(path, numReplication);
            out = new ObjectOutputStream(new BufferedOutputStream(os, 4096));
            mapJoinTableSerde.persist(out, tableContainer);
        } finally {
            if (out != null) {
                out.close();
            } else if (os != null) {
                os.close();
            }
        }
        FileStatus status = fs.getFileStatus(path);
        htsOperator.console.printInfo(Utilities.now() + "\tUploaded 1 File to: " + path + " (" + status.getLen() + " bytes)");
    } catch (Exception e) {
        // Failed to dump the side-table, remove the partial file
        try {
            fs.delete(path, false);
        } catch (Exception ex) {
            LOG.warn("Got exception in deleting partial side-table dump for tag: " + tag + ", file " + path, ex);
        }
        throw e;
    }
    tableContainer.clear();
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), FileSystem (org.apache.hadoop.fs.FileSystem), BufferedOutputStream (java.io.BufferedOutputStream), ObjectOutputStream (java.io.ObjectOutputStream), OutputStream (java.io.OutputStream), FileExistsException (org.apache.commons.io.FileExistsException), HiveException (org.apache.hadoop.hive.ql.metadata.HiveException), MapJoinTableContainerSerDe (org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe), SparkBucketMapJoinContext (org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext), BucketMapJoinContext (org.apache.hadoop.hive.ql.plan.BucketMapJoinContext), MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork)
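
The nested try/finally above makes sure whichever stream was opened gets closed; the same write can be expressed with try-with-resources, since closing the ObjectOutputStream also closes the BufferedOutputStream and the underlying stream it wraps. A minimal sketch under that assumption, pulled out into a hypothetical helper so the types are explicit (error handling and the unique-file-name loop from the example are omitted):

import java.io.BufferedOutputStream;
import java.io.ObjectOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinPersistableTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;

public class SideTableDumpSketch {

    // Hypothetical helper, not part of Hive: writes one side-table dump to an already
    // chosen path. Declared "throws Exception" to mirror flushToFile above.
    static void writeDump(FileSystem fs, Path path, short numReplication,
                          MapJoinTableContainerSerDe serde,
                          MapJoinPersistableTableContainer container) throws Exception {
        try (ObjectOutputStream out = new ObjectOutputStream(
                new BufferedOutputStream(fs.create(path, numReplication), 4096))) {
            serde.persist(out, container);
        }
    }
}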

Example 3 with BucketMapJoinContext

Use of org.apache.hadoop.hive.ql.plan.BucketMapJoinContext in project hive by apache.

From the class MapredLocalTask, method setUpFetchOpContext:

private void setUpFetchOpContext(FetchOperator fetchOp, String alias, String currentInputFile) throws Exception {
    BucketMapJoinContext bucketMatcherCxt = this.work.getBucketMapjoinContext();
    Class<? extends BucketMatcher> bucketMatcherCls = bucketMatcherCxt.getBucketMatcherClass();
    BucketMatcher bucketMatcher = ReflectionUtils.newInstance(bucketMatcherCls, null);
    bucketMatcher.setAliasBucketFileNameMapping(bucketMatcherCxt.getAliasBucketFileNameMapping());
    List<Path> aliasFiles = bucketMatcher.getAliasBucketFiles(currentInputFile, bucketMatcherCxt.getMapJoinBigTableAlias(), alias);
    fetchOp.setupContext(aliasFiles);
}
Also used: Path (org.apache.hadoop.fs.Path), BucketMatcher (org.apache.hadoop.hive.ql.exec.BucketMatcher), BucketMapJoinContext (org.apache.hadoop.hive.ql.plan.BucketMapJoinContext)

Example 4 with BucketMapJoinContext

Use of org.apache.hadoop.hive.ql.plan.BucketMapJoinContext in project hive by apache.

From the class HashTableLoader, method load:

@Override
public void load(MapJoinTableContainer[] mapJoinTables, MapJoinTableContainerSerDe[] mapJoinTableSerdes) throws HiveException {
    // Note: it's possible that a MJ operator is in a ReduceWork, in which case the
    // currentInputPath will be null. But, since currentInputPath is only interesting
    // for bucket join case, and for bucket join the MJ operator will always be in
    // a MapWork, this should be OK.
    String currentInputPath = context.getCurrentInputPath() == null ? null : context.getCurrentInputPath().toString();
    LOG.info("******* Load from HashTable for input file: " + currentInputPath);
    MapredLocalWork localWork = context.getLocalWork();
    try {
        if (localWork.getDirectFetchOp() != null) {
            loadDirectly(mapJoinTables, currentInputPath);
        }
        // All HashTables share the same base dir,
        // which is passed in as the tmp path
        Path baseDir = localWork.getTmpPath();
        if (baseDir == null) {
            return;
        }
        FileSystem fs = FileSystem.get(baseDir.toUri(), hconf);
        BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
        boolean firstContainer = true;
        boolean useOptimizedContainer = !useFastContainer && HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
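        // Load the dumped hash table for each small-table position under the shared base dir;
        // the big-table position and positions that already have a table are skipped.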
        for (int pos = 0; pos < mapJoinTables.length; pos++) {
            if (pos == desc.getPosBigTable() || mapJoinTables[pos] != null) {
                continue;
            }
            if (useOptimizedContainer) {
                MapJoinObjectSerDeContext keyCtx = mapJoinTableSerdes[pos].getKeyContext();
                ObjectInspector keyOI = keyCtx.getSerDe().getObjectInspector();
                if (!MapJoinBytesTableContainer.isSupportedKey(keyOI)) {
                    if (firstContainer) {
                        LOG.warn("Not using optimized table container." + "Only a subset of mapjoin keys is supported.");
                        useOptimizedContainer = false;
                        HiveConf.setBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE, false);
                    } else {
                        throw new HiveException("Only a subset of mapjoin keys is supported.");
                    }
                }
            }
            firstContainer = false;
            String bigInputPath = currentInputPath;
            if (currentInputPath != null && mapJoinCtx != null) {
                if (!desc.isBucketMapJoin()) {
                    bigInputPath = null;
                } else {
                    Set<String> aliases = ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get(pos);
                    String alias = aliases.iterator().next();
                    // Any one small table input path
                    String smallInputPath = mapJoinCtx.getAliasBucketFileNameMapping().get(alias).get(bigInputPath).get(0);
                    bigInputPath = mapJoinCtx.getMappingBigFile(alias, smallInputPath);
                }
            }
            String fileName = localWork.getBucketFileName(bigInputPath);
            Path path = Utilities.generatePath(baseDir, desc.getDumpFilePrefix(), (byte) pos, fileName);
            mapJoinTables[pos] = load(fs, path, mapJoinTableSerdes[pos]);
        }
    } catch (Exception e) {
        throw new HiveException(e);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), HiveException (org.apache.hadoop.hive.ql.metadata.HiveException), FileSystem (org.apache.hadoop.fs.FileSystem), SparkBucketMapJoinContext (org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext), BucketMapJoinContext (org.apache.hadoop.hive.ql.plan.BucketMapJoinContext), MapJoinObjectSerDeContext (org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext), MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork), ExecutionException (java.util.concurrent.ExecutionException)
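
Examples 2 and 4 are the two ends of one convention: the Spark hash table sink writes each small table's dump under the shared tmp path via Utilities.generatePath(tmpPath, dumpFilePrefix, tag, bucketFileName), and this loader recomputes the same path for each small-table position to read the dump back. A minimal sketch of that shared path computation, with invented values purely for illustration:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;

public class DumpPathSketch {

    public static void main(String[] args) {
        // Hypothetical values; writer and reader must agree on all four of them.
        Path tmpPath = new Path("hdfs:///tmp/hive/_local_work"); // getTmpHDFSPath() / getTmpPath()
        String dumpFilePrefix = "mapjoin";                       // conf/desc.getDumpFilePrefix()
        byte smallTablePos = 1;                                  // "tag" in Example 2, (byte) pos in Example 4
        String bucketFileName = "000000_0";                      // localWork.getBucketFileName(bigInputPath)

        // Same call on both sides, so the loader looks in exactly the directory the sink wrote to.
        Path dumpDir = Utilities.generatePath(tmpPath, dumpFilePrefix, smallTablePos, bucketFileName);
        System.out.println(dumpDir);
    }
}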

Example 5 with BucketMapJoinContext

Use of org.apache.hadoop.hive.ql.plan.BucketMapJoinContext in project hive by apache.

From the class SparkSortMergeJoinFactory, method setupBucketMapJoinInfo:

private static void setupBucketMapJoinInfo(MapWork plan, SMBMapJoinOperator currMapJoinOp) {
    if (currMapJoinOp != null) {
        Map<String, Map<String, List<String>>> aliasBucketFileNameMapping = currMapJoinOp.getConf().getAliasBucketFileNameMapping();
        if (aliasBucketFileNameMapping != null) {
            MapredLocalWork localPlan = plan.getMapRedLocalWork();
            if (localPlan == null) {
                localPlan = currMapJoinOp.getConf().getLocalWork();
            } else {
                // local plan is not null, we want to merge it into SMBMapJoinOperator's local work
                MapredLocalWork smbLocalWork = currMapJoinOp.getConf().getLocalWork();
                if (smbLocalWork != null) {
                    localPlan.getAliasToFetchWork().putAll(smbLocalWork.getAliasToFetchWork());
                    localPlan.getAliasToWork().putAll(smbLocalWork.getAliasToWork());
                }
            }
            if (localPlan == null) {
                return;
            }
            plan.setMapRedLocalWork(null);
            currMapJoinOp.getConf().setLocalWork(localPlan);
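            // Build the bucket map join context from the SMB join's mappings so that, at runtime,
            // each big-table input file can be matched to its small-table bucket files (see Examples 1 and 3).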
            BucketMapJoinContext bucketMJCxt = new BucketMapJoinContext();
            localPlan.setBucketMapjoinContext(bucketMJCxt);
            bucketMJCxt.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
            bucketMJCxt.setBucketFileNameMapping(currMapJoinOp.getConf().getBigTableBucketNumMapping());
            localPlan.setInputFileChangeSensitive(true);
            bucketMJCxt.setMapJoinBigTableAlias(currMapJoinOp.getConf().getBigTableAlias());
            bucketMJCxt.setBucketMatcherClass(org.apache.hadoop.hive.ql.exec.DefaultBucketMatcher.class);
            bucketMJCxt.setBigTablePartSpecToFileMapping(currMapJoinOp.getConf().getBigTablePartSpecToFileMapping());
            plan.setUseBucketizedHiveInputFormat(true);
        }
    }
}
Also used: BucketMapJoinContext (org.apache.hadoop.hive.ql.plan.BucketMapJoinContext), MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork), Map (java.util.Map)

Aggregations

BucketMapJoinContext (org.apache.hadoop.hive.ql.plan.BucketMapJoinContext): 5 uses
Path (org.apache.hadoop.fs.Path): 4 uses
MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork): 3 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 2 uses
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 2 uses
SparkBucketMapJoinContext (org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext): 2 uses
BufferedOutputStream (java.io.BufferedOutputStream): 1 use
ObjectOutputStream (java.io.ObjectOutputStream): 1 use
OutputStream (java.io.OutputStream): 1 use
Map (java.util.Map): 1 use
ExecutionException (java.util.concurrent.ExecutionException): 1 use
FileExistsException (org.apache.commons.io.FileExistsException): 1 use
FileStatus (org.apache.hadoop.fs.FileStatus): 1 use
BucketMatcher (org.apache.hadoop.hive.ql.exec.BucketMatcher): 1 use
MapJoinObjectSerDeContext (org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext): 1 use
MapJoinTableContainerSerDe (org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe): 1 use
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 1 use