Example 1 with SparkBucketMapJoinContext

Use of org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext in project hive by apache.

Class SparkHashTableSinkOperator, method flushToFile:

protected void flushToFile(MapJoinPersistableTableContainer tableContainer, byte tag) throws Exception {
    MapredLocalWork localWork = getExecContext().getLocalWork();
    BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
    Path inputPath = getExecContext().getCurrentInputPath();
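    // For bucket map join, map the current small-table input file to its corresponding big-table bucket file.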
    String bigInputPath = null;
    if (inputPath != null && mapJoinCtx != null) {
        Set<String> aliases = ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get((int) tag);
        bigInputPath = mapJoinCtx.getMappingBigFile(aliases.iterator().next(), inputPath.toString());
    }
    // get tmp file URI
    Path tmpURI = localWork.getTmpHDFSPath();
    LOG.info("Temp URI for side table: " + tmpURI);
    // get current bucket file name
    String fileName = localWork.getBucketFileName(bigInputPath);
    // get the tmp URI path; it will be a hdfs path if not local mode
    String dumpFilePrefix = conf.getDumpFilePrefix();
    Path path = Utilities.generatePath(tmpURI, dumpFilePrefix, tag, fileName);
    FileSystem fs = path.getFileSystem(htsOperator.getConfiguration());
    short replication = fs.getDefaultReplication(path);
    // Create the folder and its parents if not there
    fs.mkdirs(path);
    while (true) {
        path = new Path(path, getOperatorId() + "-" + Math.abs(Utilities.randGen.nextInt()));
        try {
            // This will guarantee file name uniqueness.
            if (fs.createNewFile(path)) {
                break;
            }
        } catch (FileExistsException e) {
        // No problem, use a new name
        }
    }
    // TODO find out numOfPartitions for the big table
    int numOfPartitions = replication;
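    // Dump the file with at least minReplication copies; the side table is read by many tasks.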
    replication = (short) Math.max(minReplication, numOfPartitions);
    htsOperator.console.printInfo(Utilities.now() + "\tDump the side-table for tag: " + tag + " with group count: " + tableContainer.size() + " into file: " + path);
    try {
        // get the hashtable file and path
        OutputStream os = null;
        ObjectOutputStream out = null;
        MapJoinTableContainerSerDe mapJoinTableSerde = htsOperator.mapJoinTableSerdes[tag];
        try {
            os = fs.create(path, replication);
            out = new ObjectOutputStream(new BufferedOutputStream(os, 4096));
            mapJoinTableSerde.persist(out, tableContainer);
        } finally {
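            // Closing the ObjectOutputStream closes the wrapped streams too; fall back to the raw stream if wrapping failed.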
            if (out != null) {
                out.close();
            } else if (os != null) {
                os.close();
            }
        }
        FileStatus status = fs.getFileStatus(path);
        htsOperator.console.printInfo(Utilities.now() + "\tUploaded 1 File to: " + path + " (" + status.getLen() + " bytes)");
    } catch (Exception e) {
        // Failed to dump the side-table, remove the partial file
        try {
            fs.delete(path, false);
        } catch (Exception ex) {
            LOG.warn("Got exception in deleting partial side-table dump for tag: " + tag + ", file " + path, ex);
        }
        throw e;
    }
    tableContainer.clear();
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) OutputStream(java.io.OutputStream) BufferedOutputStream(java.io.BufferedOutputStream) ObjectOutputStream(java.io.ObjectOutputStream) FileExistsException(org.apache.commons.io.FileExistsException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) MapJoinTableContainerSerDe(org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe) SparkBucketMapJoinContext(org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext) BucketMapJoinContext(org.apache.hadoop.hive.ql.plan.BucketMapJoinContext) FileSystem(org.apache.hadoop.fs.FileSystem) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork)
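
The retry loop in flushToFile leans on FileSystem.createNewFile, which atomically creates the file and reports whether this writer won the race. Below is a minimal standalone sketch of the same pattern, assuming a local FileSystem; the dump directory and operator id are hypothetical placeholders, not Hive's actual values.

import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class UniqueDumpFile {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.getLocal(new Configuration());
        // Hypothetical dump directory; flushToFile derives the real one from the tmp path.
        Path dir = new Path("/tmp/hts-dump");
        fs.mkdirs(dir);
        Random rand = new Random();
        Path file;
        do {
            // Pick a fresh random suffix on each attempt; createNewFile returns
            // false (or throws on some filesystems) when the name is already taken.
            file = new Path(dir, "OP_1-" + Math.abs(rand.nextInt()));
        } while (!fs.createNewFile(file));
        System.out.println("Created " + file);
    }
}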

Example 2 with SparkBucketMapJoinContext

Use of org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext in project hive by apache.

Class HashTableLoader, method load:

@Override
public void load(MapJoinTableContainer[] mapJoinTables, MapJoinTableContainerSerDe[] mapJoinTableSerdes) throws HiveException {
    // Note: it's possible that a MJ operator is in a ReduceWork, in which case the
    // currentInputPath will be null. But, since currentInputPath is only interesting
    // for bucket join case, and for bucket join the MJ operator will always be in
    // a MapWork, this should be OK.
    String currentInputPath = context.getCurrentInputPath() == null ? null : context.getCurrentInputPath().toString();
    LOG.info("******* Load from HashTable for input file: " + currentInputPath);
    MapredLocalWork localWork = context.getLocalWork();
    try {
        if (localWork.getDirectFetchOp() != null) {
            loadDirectly(mapJoinTables, currentInputPath);
        }
        // All HashTables share the same base dir,
        // which is passed in as the tmp path
        Path baseDir = localWork.getTmpPath();
        if (baseDir == null) {
            return;
        }
        FileSystem fs = FileSystem.get(baseDir.toUri(), hconf);
        BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
        boolean firstContainer = true;
        boolean useOptimizedContainer = !useFastContainer && HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
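        // Load one hash table per small-table position; skip the big table and positions already loaded.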
        for (int pos = 0; pos < mapJoinTables.length; pos++) {
            if (pos == desc.getPosBigTable() || mapJoinTables[pos] != null) {
                continue;
            }
            if (useOptimizedContainer) {
                MapJoinObjectSerDeContext keyCtx = mapJoinTableSerdes[pos].getKeyContext();
                ObjectInspector keyOI = keyCtx.getSerDe().getObjectInspector();
                if (!MapJoinBytesTableContainer.isSupportedKey(keyOI)) {
                    if (firstContainer) {
                        LOG.warn("Not using optimized table container." + "Only a subset of mapjoin keys is supported.");
                        useOptimizedContainer = false;
                        HiveConf.setBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE, false);
                    } else {
                        throw new HiveException("Only a subset of mapjoin keys is supported.");
                    }
                }
            }
            firstContainer = false;
            String bigInputPath = currentInputPath;
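            // For bucket map join, normalize the current big-table input through the
            // small-table mapping to recover the bucket file name used at dump time.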
            if (currentInputPath != null && mapJoinCtx != null) {
                if (!desc.isBucketMapJoin()) {
                    bigInputPath = null;
                } else {
                    Set<String> aliases = ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get(pos);
                    String alias = aliases.iterator().next();
                    // Any one small table input path
                    String smallInputPath = mapJoinCtx.getAliasBucketFileNameMapping().get(alias).get(bigInputPath).get(0);
                    bigInputPath = mapJoinCtx.getMappingBigFile(alias, smallInputPath);
                }
            }
            String fileName = localWork.getBucketFileName(bigInputPath);
            Path path = Utilities.generatePath(baseDir, desc.getDumpFilePrefix(), (byte) pos, fileName);
            mapJoinTables[pos] = load(fs, path, mapJoinTableSerdes[pos]);
        }
    } catch (Exception e) {
        throw new HiveException(e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FileSystem(org.apache.hadoop.fs.FileSystem) SparkBucketMapJoinContext(org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext) BucketMapJoinContext(org.apache.hadoop.hive.ql.plan.BucketMapJoinContext) MapJoinObjectSerDeContext(org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork)
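
On the load side, each dump is read back through a stream opened with fs.open, symmetric to the ObjectOutputStream write in Example 1. Below is a minimal sketch of that read, assuming the dump path arrives as a command-line argument; the single readObject is a hypothetical stand-in for what MapJoinTableContainerSerDe actually deserializes.

import java.io.BufferedInputStream;
import java.io.ObjectInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadDump {
    public static void main(String[] args) throws Exception {
        Path path = new Path(args[0]);
        // Resolve the filesystem from the path itself, as both examples do.
        FileSystem fs = path.getFileSystem(new Configuration());
        try (ObjectInputStream in =
                new ObjectInputStream(new BufferedInputStream(fs.open(path), 4096))) {
            // Stand-in read; the real serde reconstructs a MapJoinTableContainer.
            Object first = in.readObject();
            System.out.println("Read: " + first);
        }
    }
}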

Aggregations

FileSystem (org.apache.hadoop.fs.FileSystem): 2 uses
Path (org.apache.hadoop.fs.Path): 2 uses
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 2 uses
BucketMapJoinContext (org.apache.hadoop.hive.ql.plan.BucketMapJoinContext): 2 uses
MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork): 2 uses
SparkBucketMapJoinContext (org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext): 2 uses
BufferedOutputStream (java.io.BufferedOutputStream): 1 use
ObjectOutputStream (java.io.ObjectOutputStream): 1 use
OutputStream (java.io.OutputStream): 1 use
FileExistsException (org.apache.commons.io.FileExistsException): 1 use
FileStatus (org.apache.hadoop.fs.FileStatus): 1 use
MapJoinObjectSerDeContext (org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext): 1 use
MapJoinTableContainerSerDe (org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe): 1 use
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 1 use