Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
The class AbstractJoinTaskDispatcher, method getTotalKnownInputSize.
public long getTotalKnownInputSize(Context context, MapWork currWork, Map<Path, ArrayList<String>> pathToAliases, HashMap<String, Long> aliasToSize) throws SemanticException {
  try {
    // Go over all the input paths and calculate a known total size, plus a known
    // size for each input alias. The call below populates the Context's per-path
    // ContentSummary cache (read back via context.getCS below); the returned total
    // length is not used directly here.
    Utilities.getInputSummary(context, currWork, null).getLength();
    // Build the alias-to-size mapping. Once one table is chosen as the big table,
    // this mapping can be used to determine the total size of the remaining tables,
    // which become the small (map-side) tables.
    long aliasTotalKnownInputSize = 0L;
    for (Map.Entry<Path, ArrayList<String>> entry : pathToAliases.entrySet()) {
      Path path = entry.getKey();
      List<String> aliasList = entry.getValue();
      ContentSummary cs = context.getCS(path);
      if (cs != null) {
        long size = cs.getLength();
        for (String alias : aliasList) {
          aliasTotalKnownInputSize += size;
          Long es = aliasToSize.get(alias);
          if (es == null) {
            es = 0L;
          }
          es += size;
          aliasToSize.put(alias, es);
        }
      }
    }
    return aliasTotalKnownInputSize;
  } catch (Exception e) {
    // Preserve the original exception as the cause rather than only its message.
    throw new SemanticException("Generate Map Join Task Error: " + e.getMessage(), e);
  }
}
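The alias sizes gathered above typically feed big-table selection for a map join. The following is a minimal, hypothetical sketch (the helper name and its use are assumptions, not Hive's actual selection logic) of how such a map could be consumed to pick the largest alias as the big-table candidate.

// Hypothetical helper: pick the alias with the largest known input size.
// Not part of Hive; shown only to illustrate how aliasToSize could be used.
static String pickBigTableAlias(Map<String, Long> aliasToSize) {
  String bigAlias = null;
  long bigSize = -1L;
  for (Map.Entry<String, Long> e : aliasToSize.entrySet()) {
    Long size = e.getValue();
    if (size != null && size > bigSize) {
      bigSize = size;
      bigAlias = e.getKey();
    }
  }
  return bigAlias; // null when the map is empty
}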
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
The class SymlinkTextInputFormat, method getContentSummary.
@Override
public ContentSummary getContentSummary(Path p, JobConf job) throws IOException {
  // Aggregate length, file count, and directory count across the symlink targets.
  long[] summary = { 0, 0, 0 };
  List<Path> targetPaths = new ArrayList<Path>();
  List<Path> symlinkPaths = new ArrayList<Path>();
  try {
    getTargetPathsFromSymlinksDirs(job, new Path[] { p }, targetPaths, symlinkPaths);
  } catch (Exception e) {
    throw new IOException("Error parsing symlinks from specified job input path.", e);
  }
  for (Path path : targetPaths) {
    FileSystem fs = path.getFileSystem(job);
    ContentSummary cs = fs.getContentSummary(path);
    summary[0] += cs.getLength();
    summary[1] += cs.getFileCount();
    summary[2] += cs.getDirectoryCount();
  }
  return new ContentSummary(summary[0], summary[1], summary[2]);
}
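A minimal usage sketch for the override above, assuming a hypothetical symlink directory and a default JobConf; it only prints the aggregated totals.

import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class SymlinkSummaryExample {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(); // assumed default configuration
    Path symlinkDir = new Path("/tmp/symlink-input"); // hypothetical symlink directory
    ContentSummary cs = new SymlinkTextInputFormat().getContentSummary(symlinkDir, job);
    System.out.println("bytes=" + cs.getLength()
        + " files=" + cs.getFileCount()
        + " dirs=" + cs.getDirectoryCount());
  }
}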
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
The class IndexWhereProcessor, method rewriteForIndexes.
/**
 * Generate the tasks for the index query (the results of querying the index
 * are stored in a temporary file) via the index's HiveIndexHandler, and record
 * them in the query context.
 * @param predicate predicate of the query to rewrite
 * @param indexes indexes to use for the rewrite (all of the same type)
 * @param pctx parse context of the original query
 * @param task original task before the rewrite
 * @param queryContext stores the generated tasks and other return values
 */
private void rewriteForIndexes(ExprNodeDesc predicate, List<Index> indexes, ParseContext pctx, Task<MapredWork> task, HiveIndexQueryContext queryContext) throws SemanticException {
  HiveIndexHandler indexHandler;
  // All indexes in the list are of the same type, so the same handler can
  // generate the index query tasks for all of them.
  Index index = indexes.get(0);
  try {
    indexHandler = HiveUtils.getIndexHandler(pctx.getConf(), index.getIndexHandlerClass());
  } catch (HiveException e) {
    LOG.error("Exception while loading IndexHandler: " + index.getIndexHandlerClass(), e);
    throw new SemanticException("Failed to load indexHandler: " + index.getIndexHandlerClass(), e);
  }
  // Check the input size; skip the rewrite if the handler decides the input
  // size is not suitable for an index query.
  try {
    ContentSummary inputSummary = Utilities.getInputSummary(pctx.getContext(), task.getWork().getMapWork(), null);
    long inputSize = inputSummary.getLength();
    if (!indexHandler.checkQuerySize(inputSize, pctx.getConf())) {
      queryContext.setQueryTasks(null);
      return;
    }
  } catch (IOException e) {
    throw new SemanticException("Failed to get task size", e);
  }
  // Use the IndexHandler to generate the index query.
  indexHandler.generateIndexQuery(indexes, predicate, pctx, queryContext);
}
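The size gate above delegates to HiveIndexHandler.checkQuerySize; the underlying pattern is simply a ContentSummary length compared against a bound. A minimal sketch of that pattern, with an illustrative method name and a caller-supplied threshold rather than Hive's handler API:

// Illustrative only: compute the total input length and compare it to a threshold.
static boolean withinSizeLimit(FileSystem fs, Path input, long maxBytes) throws IOException {
  ContentSummary cs = fs.getContentSummary(input);
  return cs.getLength() <= maxBytes;
}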
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
The class FileUtils, method copy.
@VisibleForTesting
static boolean copy(FileSystem srcFS, Path src, FileSystem dstFS, Path dst, boolean deleteSource, boolean overwrite, HiveConf conf, HadoopShims shims) throws IOException {
  boolean copied = false;
  boolean triedDistcp = false;
  /* Run distcp if source file/dir is too big */
  if (srcFS.getUri().getScheme().equals("hdfs")) {
    ContentSummary srcContentSummary = srcFS.getContentSummary(src);
    if (srcContentSummary.getFileCount() > conf.getLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXNUMFILES)
        && srcContentSummary.getLength() > conf.getLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXSIZE)) {
      LOG.info("Source is " + srcContentSummary.getLength() + " bytes. (MAX: " + conf.getLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXSIZE) + ")");
      LOG.info("Source is " + srcContentSummary.getFileCount() + " files. (MAX: " + conf.getLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXNUMFILES) + ")");
      LOG.info("Launch distributed copy (distcp) job.");
      triedDistcp = true;
      copied = shims.runDistCp(src, dst, conf);
      if (copied && deleteSource) {
        srcFS.delete(src, true);
      }
    }
  }
  if (!triedDistcp) {
    copied = FileUtil.copy(srcFS, src, dstFS, dst, deleteSource, overwrite, conf);
  }
  boolean inheritPerms = conf.getBoolVar(HiveConf.ConfVars.HIVE_WAREHOUSE_SUBDIR_INHERIT_PERMS);
  if (copied && inheritPerms) {
    HdfsUtils.setFullFileStatus(conf, new HdfsUtils.HadoopFileStatus(conf, dstFS, dst.getParent()), dstFS, dst, true);
  }
  return copied;
}
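Note that distcp is attempted only when both the file count and the total length exceed their configured maxima, and only for HDFS sources. A hedged sketch of that decision in isolation (the method name is an assumption; the ConfVars are the ones referenced above):

static boolean shouldUseDistCp(FileSystem srcFS, Path src, HiveConf conf) throws IOException {
  if (!"hdfs".equals(srcFS.getUri().getScheme())) {
    return false; // distcp is only considered for HDFS sources
  }
  ContentSummary cs = srcFS.getContentSummary(src);
  return cs.getFileCount() > conf.getLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXNUMFILES)
      && cs.getLength() > conf.getLongVar(HiveConf.ConfVars.HIVE_EXEC_COPYFILE_MAXSIZE);
}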
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
The class HiveHarFileSystem, method getContentSummary.
@Override
public ContentSummary getContentSummary(Path f) throws IOException {
  // HarFileSystem has a bug where this method does not work properly
  // if the underlying FS is HDFS. See MAPREDUCE-1877 for more
  // information. This method is from FileSystem.
  FileStatus status = getFileStatus(f);
  if (!status.isDir()) {
    // f is a file
    return new ContentSummary(status.getLen(), 1, 0);
  }
  // f is a directory
  long[] summary = { 0, 0, 1 };
  for (FileStatus s : listStatus(f)) {
    ContentSummary c = s.isDir() ? getContentSummary(s.getPath()) : new ContentSummary(s.getLen(), 1, 0);
    summary[0] += c.getLength();
    summary[1] += c.getFileCount();
    summary[2] += c.getDirectoryCount();
  }
  return new ContentSummary(summary[0], summary[1], summary[2]);
}
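A minimal usage sketch, assuming the fs.har.impl setting is pointed at HiveHarFileSystem and using a hypothetical archive path; it just aggregates and prints the archive's contents via the override above.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HarSummaryExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Assumption: route the har:// scheme to Hive's HAR wrapper.
    conf.set("fs.har.impl", "org.apache.hadoop.hive.shims.HiveHarFileSystem");
    Path harPath = new Path("har:///user/hive/warehouse/archive.har"); // hypothetical archive
    FileSystem fs = FileSystem.get(URI.create(harPath.toString()), conf);
    ContentSummary cs = fs.getContentSummary(harPath);
    System.out.println(cs.getLength() + " bytes, " + cs.getFileCount() + " files, "
        + cs.getDirectoryCount() + " directories");
  }
}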