Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
Class TestGetInputSummary, method testGetInputSummaryWithContentSummaryInputFormat.
@Test
public void testGetInputSummaryWithContentSummaryInputFormat() throws IOException {
  final int bytesPerFile = 10;

  // Cap the input-listing thread pool at two worker threads for this run.
  jobConf.setInt(ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2);

  final Collection<Path> testPaths =
      Arrays.asList(
          new Path("p1/test.txt"),
          new Path("p2/test.txt"),
          new Path("p3/test.txt"),
          new Path("p4/test.txt"),
          new Path("p5/test.txt"));

  // Canned per-path figures the custom input format will report.
  ContentSummary cannedSummary =
      new ContentSummary.Builder().length(bytesPerFile).fileCount(2).directoryCount(1).build();
  ContentSummaryInputFormatTestClass.setContentSummary(cannedSummary);

  /*
   * Write twice as many bytes to each file as the canned summary reports, so
   * the assertions below prove the totals come from ContentSummaryInputFormat
   * rather than from the filesystem itself.
   */
  ContentSummary summary = runTestGetInputSummary(jobConf, properties, testPaths,
      bytesPerFile * 2, ContentSummaryInputFormatTestClass.class, Collections.emptyMap());

  assertEquals(testPaths.size() * bytesPerFile, summary.getLength());
  assertEquals(testPaths.size() * 2, summary.getFileCount());
  assertEquals(testPaths.size(), summary.getDirectoryCount());
}
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
Class TestGetInputSummary, method testGetInputSummaryWithASingleThread.
@Test
public void testGetInputSummaryWithASingleThread() throws IOException {
  final int bytesPerFile = 5;

  // Zero threads disables the listing thread pool, so the input summary is
  // computed serially on the calling thread.
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);

  final Collection<Path> testPaths =
      Arrays.asList(
          new Path("p1/test.txt"),
          new Path("p2/test.txt"),
          new Path("p3/test.txt"),
          new Path("p4/test.txt"),
          new Path("p5/test.txt"));

  ContentSummary summary = runTestGetInputSummary(jobConf, properties, testPaths,
      bytesPerFile, HiveInputFormat.class, Collections.emptyMap());

  assertEquals(testPaths.size() * bytesPerFile, summary.getLength());
  assertEquals(testPaths.size(), summary.getFileCount());
  assertEquals(testPaths.size(), summary.getDirectoryCount());
}
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
Class CopyUtils, method regularCopy.
/*
 * Decides whether the given files should be copied with a regular (local) copy
 * rather than a distributed copy. A local copy is chosen when any of:
 *  1. We are running a Hive replication test (hiveInReplTest).
 *  2. Either the source or the destination is a "local" FileSystem ("file" scheme).
 *  3. The aggregate size AND the number of files across all source paths (each
 *     may be a directory or a file) stay below the configured local-copy limits.
 */
boolean regularCopy(FileSystem sourceFs, List<ReplChangeManager.FileInfo> fileList) throws IOException {
  if (hiveInReplTest) {
    return true;
  }
  if (isLocal(sourceFs) || isLocal(destinationFs)) {
    return true;
  }
  /*
   * We have reached the point where we are transferring files across fileSystems;
   * fall back to a local copy only if the total payload stays under the limits.
   */
  long size = 0;
  long numberOfFiles = 0;
  for (ReplChangeManager.FileInfo fileInfo : fileList) {
    ContentSummary contentSummary = null;
    try {
      contentSummary = getContentSummary(sourceFs, fileInfo.getEffectivePath());
    } catch (IOException e) {
      // In replication, if the source file does not exist, try the cmroot copy
      // of the file instead and remember that the source path is unusable.
      // NOTE(review): if there is no cmroot fallback, the IOException is
      // swallowed and the file simply contributes nothing to the totals —
      // presumably deliberate best-effort sizing; confirm before changing.
      if (fileInfo.isUseSourcePath() && fileInfo.getCmPath() != null) {
        contentSummary = getContentSummary(sourceFs, fileInfo.getCmPath());
        fileInfo.setIsUseSourcePath(false);
      }
    }
    if (contentSummary != null) {
      size += contentSummary.getLength();
      numberOfFiles += contentSummary.getFileCount();
      // Bail out early: once either limit is exceeded, a distributed copy is required.
      if (limitReachedForLocalCopy(size, numberOfFiles)) {
        return false;
      }
    }
  }
  return true;
}
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
Class QueryResultsCache, method calculateEntrySize.
/**
 * Records the total on-disk byte size of the query's result directory on the
 * given cache entry.
 */
private void calculateEntrySize(CacheEntry entry, FetchWork fetchWork) throws IOException {
  Path resultsDir = fetchWork.getTblDir();
  ContentSummary resultsSummary = resultsDir.getFileSystem(conf).getContentSummary(resultsDir);
  entry.size = resultsSummary.getLength();
}
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
Class MapReduceCompiler, method decideExecMode.
/**
 * Decides, per MapReduce task, whether the query can run in local mode based on
 * estimated input size and reducer count. If every task qualifies, the job
 * tracker address is switched to "local" for the whole query.
 */
@Override
protected void decideExecMode(List<Task<?>> rootTasks, Context ctx, GlobalLimitCtx globalLimitCtx) throws SemanticException {
  // bypass for explain queries for now
  if (ctx.isExplainSkipExecution()) {
    return;
  }
  // user has told us to run in local mode or doesn't want auto-local mode
  if (ctx.isLocalOnlyExecutionMode() || !conf.getBoolVar(HiveConf.ConfVars.LOCALMODEAUTO)) {
    return;
  }
  // Exclude intermediate MR temp files from the input-size accounting; the
  // final variable is needed so the anonymous PathFilter can capture the context.
  final Context lCtx = ctx;
  PathFilter p = new PathFilter() {
    @Override
    public boolean accept(Path file) {
      return !lCtx.isMRTmpFileURI(file.toUri().getPath());
    }
  };
  List<ExecDriver> mrtasks = Utilities.getMRTasks(rootTasks);
  // map-reduce jobs will be run locally based on data size
  // first find out if any of the jobs needs to run non-locally
  boolean hasNonLocalJob = false;
  for (ExecDriver mrtask : mrtasks) {
    try {
      ContentSummary inputSummary = Utilities.getInputSummary(ctx, mrtask.getWork().getMapWork(), p);
      int numReducers = getNumberOfReducers(mrtask.getWork(), conf);
      long estimatedInput;
      if (globalLimitCtx != null && globalLimitCtx.isEnable()) {
        // If the global limit optimization is triggered, we will
        // estimate input data actually needed based on limit rows.
        // estimated Input = (num_limit * max_size_per_row) * (estimated_map + 2)
        //
        // (The +2 in the formula matches the code below: estimatedNumMap is
        // length/minSplitSize + 1, and the product multiplies by
        // estimatedNumMap + 1.)
        long sizePerRow = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
        estimatedInput = (globalLimitCtx.getGlobalOffset() + globalLimitCtx.getGlobalLimit()) * sizePerRow;
        // NOTE(review): assumes MAPREDMINSPLITSIZE is configured nonzero;
        // a zero value would divide by zero here — confirm config default.
        long minSplitSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE);
        long estimatedNumMap = inputSummary.getLength() / minSplitSize + 1;
        estimatedInput = estimatedInput * (estimatedNumMap + 1);
      } else {
        // No limit optimization: estimate input as the full summarized length.
        estimatedInput = inputSummary.getLength();
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("Task: " + mrtask.getId() + ", Summary: " + inputSummary.getLength() + "," + inputSummary.getFileCount() + "," + numReducers + ", estimated Input: " + estimatedInput);
      }
      // A non-null return means this task is NOT eligible for local mode,
      // so the whole query must run on the cluster — stop checking.
      if (MapRedTask.isEligibleForLocalMode(conf, numReducers, estimatedInput, inputSummary.getFileCount()) != null) {
        hasNonLocalJob = true;
        break;
      } else {
        mrtask.setLocalMode(true);
      }
    } catch (IOException e) {
      throw new SemanticException(e);
    }
  }
  if (!hasNonLocalJob) {
    // Entire query can be run locally.
    // Save the current tracker value and restore it when done.
    ctx.setOriginalTracker(ShimLoader.getHadoopShims().getJobLauncherRpcAddress(conf));
    ShimLoader.getHadoopShims().setJobLauncherRpcAddress(conf, "local");
    console.printInfo("Automatically selecting local only mode for query");
  }
}
Aggregations