Example 31 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

From class AbstractJoinTaskDispatcher, method getTotalKnownInputSize.

public long getTotalKnownInputSize(Context context, MapWork currWork, Map<Path, ArrayList<String>> pathToAliases, HashMap<String, Long> aliasToSize) throws SemanticException {
    try {
        // Go over all the input paths and calculate a known total size, plus a
        // known size for each input alias. The return value is not needed: the
        // call's side effect is to cache a ContentSummary per input path in the
        // Context, which context.getCS(path) reads below.
        Utilities.getInputSummary(context, currWork, null);
        // Build the alias-to-size mapping. Once one table is chosen as the big
        // table, this is used to compute the total size of the remaining
        // (small) tables.
        long aliasTotalKnownInputSize = 0L;
        for (Map.Entry<Path, ArrayList<String>> entry : pathToAliases.entrySet()) {
            Path path = entry.getKey();
            List<String> aliasList = entry.getValue();
            ContentSummary cs = context.getCS(path);
            if (cs != null) {
                long size = cs.getLength();
                for (String alias : aliasList) {
                    aliasTotalKnownInputSize += size;
                    Long es = aliasToSize.get(alias);
                    if (es == null) {
                        // first time this alias is seen
                        es = 0L;
                    }
                    es += size;
                    aliasToSize.put(alias, es);
                }
            }
        }
        return aliasTotalKnownInputSize;
    } catch (Exception e) {
        // Chain the cause so the original stack trace is not lost.
        throw new SemanticException("Generate Map Join Task Error: " + e.getMessage(), e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ContentSummary(org.apache.hadoop.fs.ContentSummary) ArrayList(java.util.ArrayList) HashMap(java.util.HashMap) Map(java.util.Map) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
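The inner loop's null-check-and-box pattern predates Java 8; with java.util.Map#merge the same accumulation collapses to one call. A minimal equivalent sketch, reusing the variables from the loop above:

// Adds size to the running total for alias, starting from size if absent.
for (String alias : aliasList) {
    aliasTotalKnownInputSize += size;
    aliasToSize.merge(alias, size, Long::sum);
}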

Example 32 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

From class SymlinkTextInputFormat, method getContentSummary.

@Override
public ContentSummary getContentSummary(Path p, JobConf job) throws IOException {
    // summary[0] = length, summary[1] = file count, summary[2] = directory count
    long[] summary = { 0, 0, 0 };
    List<Path> targetPaths = new ArrayList<Path>();
    List<Path> symlinkPaths = new ArrayList<Path>();
    try {
        getTargetPathsFromSymlinksDirs(job, new Path[] { p }, targetPaths, symlinkPaths);
    } catch (Exception e) {
        throw new IOException("Error parsing symlinks from specified job input path.", e);
    }
    for (Path path : targetPaths) {
        FileSystem fs = path.getFileSystem(job);
        ContentSummary cs = fs.getContentSummary(path);
        summary[0] += cs.getLength();
        summary[1] += cs.getFileCount();
        summary[2] += cs.getDirectoryCount();
    }
    return new ContentSummary(summary[0], summary[1], summary[2]);
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) ContentSummary(org.apache.hadoop.fs.ContentSummary) ArrayList(java.util.ArrayList) IOException(java.io.IOException)
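The three-argument ContentSummary constructor used here is deprecated in newer Hadoop releases in favor of ContentSummary.Builder (available since roughly Hadoop 2.5); a minimal sketch of the equivalent construction, assuming that builder API:

// Builder-based equivalent; fields not set (quota, space consumed) keep defaults.
ContentSummary cs = new ContentSummary.Builder()
    .length(summary[0])
    .fileCount(summary[1])
    .directoryCount(summary[2])
    .build();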

Example 33 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

From class IndexWhereProcessor, method rewriteForIndexes.

/**
   * Get a list of Tasks to activate use of the indexes.
   * Generate the tasks for the index query (where we store the results of
   * querying the index in a tmp file) inside the IndexHandler.
   * @param predicate predicate of the query to rewrite
   * @param indexes indexes to use for the rewrite; all must share one handler type
   * @param pctx parse context of the query being rewritten
   * @param task original task before the rewrite
   * @param queryContext stores return values
   */
private void rewriteForIndexes(ExprNodeDesc predicate, List<Index> indexes, ParseContext pctx, Task<MapredWork> task, HiveIndexQueryContext queryContext) throws SemanticException {
    HiveIndexHandler indexHandler;
    // All indexes in the list are of the same type, and therefore can use the
    // same handler to generate the index query tasks
    Index index = indexes.get(0);
    try {
        indexHandler = HiveUtils.getIndexHandler(pctx.getConf(), index.getIndexHandlerClass());
    } catch (HiveException e) {
        LOG.error("Exception while loading IndexHandler: " + index.getIndexHandlerClass(), e);
        throw new SemanticException("Failed to load indexHandler: " + index.getIndexHandlerClass(), e);
    }
    // check the size
    try {
        ContentSummary inputSummary = Utilities.getInputSummary(pctx.getContext(), task.getWork().getMapWork(), null);
        long inputSize = inputSummary.getLength();
        if (!indexHandler.checkQuerySize(inputSize, pctx.getConf())) {
            queryContext.setQueryTasks(null);
            return;
        }
    } catch (IOException e) {
        throw new SemanticException("Failed to get task size", e);
    }
    // use the IndexHandler to generate the index query
    indexHandler.generateIndexQuery(indexes, predicate, pctx, queryContext);
}
Also used : HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) HiveIndexHandler(org.apache.hadoop.hive.ql.index.HiveIndexHandler) ContentSummary(org.apache.hadoop.fs.ContentSummary) Index(org.apache.hadoop.hive.metastore.api.Index) IOException(java.io.IOException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
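checkQuerySize is the handler's veto point: if the scanned input is too small or too large for the index to pay off, the rewrite is abandoned and the original task kept. A hypothetical sketch of such a gate, assuming the compact-index min/max ConfVars (a concrete handler may read different thresholds):

// Hypothetical size gate; a negative configured max means "no upper bound".
public boolean checkQuerySize(long querySize, HiveConf conf) {
    long minSize = conf.getLongVar(HiveConf.ConfVars.HIVEOPTINDEXFILTER_COMPACT_MINSIZE);
    long maxSize = conf.getLongVar(HiveConf.ConfVars.HIVEOPTINDEXFILTER_COMPACT_MAXSIZE);
    if (maxSize < 0) {
        maxSize = Long.MAX_VALUE;
    }
    return querySize > minSize && querySize < maxSize;
}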

Example 34 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

From class FileUtils, method copy.

@VisibleForTesting
static boolean copy(FileSystem srcFS, Path src, FileSystem dstFS, Path dst, boolean deleteSource, boolean overwrite, HiveConf conf, HadoopShims shims) throws IOException {
    boolean copied = false;
    boolean triedDistcp = false;
    /* Run distcp when the source exceeds both the configured size and file-count thresholds */
    if (srcFS.getUri().getScheme().equals("hdfs")) {
        ContentSummary srcContentSummary = srcFS.getContentSummary(src);
        if (srcContentSummary.getFileCount() > conf.getLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXNUMFILES) && srcContentSummary.getLength() > conf.getLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXSIZE)) {
            LOG.info("Source is " + srcContentSummary.getLength() + " bytes. (MAX: " + conf.getLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXSIZE) + ")");
            LOG.info("Source is " + srcContentSummary.getFileCount() + " files. (MAX: " + conf.getLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXNUMFILES) + ")");
            LOG.info("Launch distributed copy (distcp) job.");
            triedDistcp = true;
            copied = shims.runDistCp(src, dst, conf);
            if (copied && deleteSource) {
                srcFS.delete(src, true);
            }
        }
    }
    if (!triedDistcp) {
        copied = FileUtil.copy(srcFS, src, dstFS, dst, deleteSource, overwrite, conf);
    }
    boolean inheritPerms = conf.getBoolVar(HiveConf.ConfVars.HIVE_WAREHOUSE_SUBDIR_INHERIT_PERMS);
    if (copied && inheritPerms) {
        HdfsUtils.setFullFileStatus(conf, new HdfsUtils.HadoopFileStatus(conf, dstFS, dst.getParent()), dstFS, dst, true);
    }
    return copied;
}
Also used : ContentSummary(org.apache.hadoop.fs.ContentSummary) HdfsUtils(org.apache.hadoop.hive.io.HdfsUtils) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
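Note that both gates must trip before distcp is attempted (the condition uses &&); smaller sources fall through to the in-process FileUtil.copy. A minimal sketch of lowering both thresholds, e.g. to steer a test toward the distcp path (the values are illustrative):

// Force distcp for modest inputs by lowering both thresholds to 1.
HiveConf conf = new HiveConf();
conf.setLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXNUMFILES, 1L);
conf.setLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXSIZE, 1L);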

Example 35 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

From class HiveHarFileSystem, method getContentSummary.

@Override
public ContentSummary getContentSummary(Path f) throws IOException {
    // HarFileSystem has a bug where this method does not work properly
    // if the underlying FS is HDFS. See MAPREDUCE-1877 for more
    // information. This method is from FileSystem.
    FileStatus status = getFileStatus(f);
    if (!status.isDirectory()) {
        // f is a file
        return new ContentSummary(status.getLen(), 1, 0);
    }
    // f is a directory
    long[] summary = { 0, 0, 1 };
    for (FileStatus s : listStatus(f)) {
        ContentSummary c = s.isDirectory() ? getContentSummary(s.getPath()) : new ContentSummary(s.getLen(), 1, 0);
        summary[0] += c.getLength();
        summary[1] += c.getFileCount();
        summary[2] += c.getDirectoryCount();
    }
    return new ContentSummary(summary[0], summary[1], summary[2]);
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) ContentSummary(org.apache.hadoop.fs.ContentSummary)
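Because the directory branch recurses through listStatus, a single call covers the whole subtree. A minimal usage sketch, assuming fs.har.impl is configured to point at HiveHarFileSystem and an illustrative archive path:

// Path is illustrative; one call walks the entire archive.
Configuration conf = new Configuration();
Path har = new Path("har:///user/hive/warehouse/example.har");
FileSystem fs = har.getFileSystem(conf);
ContentSummary cs = fs.getContentSummary(har);
System.out.println("bytes=" + cs.getLength()
    + " files=" + cs.getFileCount()
    + " dirs=" + cs.getDirectoryCount());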

Aggregations

ContentSummary (org.apache.hadoop.fs.ContentSummary): 61 usages
Path (org.apache.hadoop.fs.Path): 42 usages
Test (org.junit.Test): 38 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 10 usages
IOException (java.io.IOException): 9 usages
Configuration (org.apache.hadoop.conf.Configuration): 8 usages
ArrayList (java.util.ArrayList): 6 usages
OutputStream (java.io.OutputStream): 5 usages
URI (java.net.URI): 5 usages
DSQuotaExceededException (org.apache.hadoop.hdfs.protocol.DSQuotaExceededException): 5 usages
QuotaExceededException (org.apache.hadoop.hdfs.protocol.QuotaExceededException): 5 usages
WebHdfsFileSystem (org.apache.hadoop.hdfs.web.WebHdfsFileSystem): 5 usages
JobConf (org.apache.hadoop.mapred.JobConf): 5 usages
HttpURLConnection (java.net.HttpURLConnection): 4 usages
HashMap (java.util.HashMap): 4 usages
Properties (java.util.Properties): 4 usages
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 4 usages
NSQuotaExceededException (org.apache.hadoop.hdfs.protocol.NSQuotaExceededException): 4 usages
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 4 usages
FileNotFoundException (java.io.FileNotFoundException): 3 usages