Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
The class MapReduceCompiler, method decideExecMode.
@Override
protected void decideExecMode(List<Task<? extends Serializable>> rootTasks, Context ctx,
    GlobalLimitCtx globalLimitCtx) throws SemanticException {
  // bypass for explain queries for now
  if (ctx.isExplainSkipExecution()) {
    return;
  }
  // user has told us to run in local mode or doesn't want auto-local mode
  if (ctx.isLocalOnlyExecutionMode() || !conf.getBoolVar(HiveConf.ConfVars.LOCALMODEAUTO)) {
    return;
  }
  final Context lCtx = ctx;
  PathFilter p = new PathFilter() {
    @Override
    public boolean accept(Path file) {
      return !lCtx.isMRTmpFileURI(file.toUri().getPath());
    }
  };
  List<ExecDriver> mrtasks = Utilities.getMRTasks(rootTasks);
  // map-reduce jobs will be run locally based on data size
  // first find out if any of the jobs needs to run non-locally
  boolean hasNonLocalJob = false;
  for (ExecDriver mrtask : mrtasks) {
    try {
      ContentSummary inputSummary = Utilities.getInputSummary(ctx, mrtask.getWork().getMapWork(), p);
      int numReducers = getNumberOfReducers(mrtask.getWork(), conf);
      long estimatedInput;
      if (globalLimitCtx != null && globalLimitCtx.isEnable()) {
        // If the global limit optimization is triggered, we will
        // estimate input data actually needed based on limit rows.
        // estimated Input = (num_limit * max_size_per_row) * (estimated_map + 2)
        //
        long sizePerRow = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
        estimatedInput = (globalLimitCtx.getGlobalOffset() + globalLimitCtx.getGlobalLimit()) * sizePerRow;
        long minSplitSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE);
        long estimatedNumMap = inputSummary.getLength() / minSplitSize + 1;
        estimatedInput = estimatedInput * (estimatedNumMap + 1);
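        // Worked example (illustrative numbers, not from the source): with offset + limit = 100 rows,
        // hive.limit.row.max.size = 100000 bytes, input length = 1 GiB and min split size = 256 MiB,
        // estimatedNumMap = 1073741824 / 268435456 + 1 = 5, so
        // estimatedInput = (100 * 100000) * (5 + 1) = 60000000 bytes.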
      } else {
        estimatedInput = inputSummary.getLength();
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("Task: " + mrtask.getId() + ", Summary: " + inputSummary.getLength() + ","
            + inputSummary.getFileCount() + "," + numReducers + ", estimated Input: " + estimatedInput);
      }
      if (MapRedTask.isEligibleForLocalMode(conf, numReducers, estimatedInput, inputSummary.getFileCount()) != null) {
        hasNonLocalJob = true;
        break;
      } else {
        mrtask.setLocalMode(true);
      }
    } catch (IOException e) {
      throw new SemanticException(e);
    }
  }
  if (!hasNonLocalJob) {
    // Entire query can be run locally.
    // Save the current tracker value and restore it when done.
    ctx.setOriginalTracker(ShimLoader.getHadoopShims().getJobLauncherRpcAddress(conf));
    ShimLoader.getHadoopShims().setJobLauncherRpcAddress(conf, "local");
    console.printInfo("Automatically selecting local only mode for query");
  }
}
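For orientation, here is a minimal sketch (not part of the Hive source; the class and method names are hypothetical) of the same decision made directly from a raw ContentSummary. It compares the summary against the hive.exec.mode.local.auto.inputbytes.max and hive.exec.mode.local.auto.input.files.max settings that drive the local-mode choice; the defaults assumed below are 128 MB and 4 files.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.Path;

public class LocalModeProbe {
  // Returns true when the input looks small enough for local-mode execution under the
  // stock hive.exec.mode.local.auto.* thresholds (assumed defaults: 128 MB and 4 files).
  public static boolean looksLocal(Configuration conf, Path input) throws IOException {
    long maxBytes = conf.getLong("hive.exec.mode.local.auto.inputbytes.max", 134217728L);
    long maxFiles = conf.getLong("hive.exec.mode.local.auto.input.files.max", 4L);
    ContentSummary summary = input.getFileSystem(conf).getContentSummary(input);
    return summary.getLength() <= maxBytes && summary.getFileCount() <= maxFiles;
  }
}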
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
The class TestUtilities, method testGetInputSummaryWithASingleThread.
@Test
public void testGetInputSummaryWithASingleThread() throws IOException {
  final int NUM_PARTITIONS = 5;
  final int BYTES_PER_FILE = 5;
  JobConf jobConf = new JobConf();
  Properties properties = new Properties();
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
  ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
}
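runTestGetInputSummary is a private helper of the test class and is not shown here. As a rough, hypothetical illustration of why the assertions hold, the sketch below builds an equivalent layout on the local filesystem (5 partition directories with one 5-byte file each) and accumulates one ContentSummary per partition directory, the way getInputSummary is presumed to aggregate them; the totals come out as length 25, file count 5 and directory count 5, since each partition directory counts itself exactly once.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SummarySketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path root = new Path("/tmp/summary-sketch");
    long totalLength = 0;
    long totalFiles = 0;
    long totalDirs = 0;
    for (int i = 0; i < 5; i++) {                      // NUM_PARTITIONS = 5
      Path part = new Path(root, "p" + i);
      fs.mkdirs(part);
      FSDataOutputStream out = fs.create(new Path(part, "data"));
      out.write(new byte[5]);                          // BYTES_PER_FILE = 5
      out.close();
      ContentSummary cs = fs.getContentSummary(part);  // one summary per partition directory
      totalLength += cs.getLength();
      totalFiles += cs.getFileCount();
      totalDirs += cs.getDirectoryCount();
    }
    // Expect 25 bytes, 5 files, 5 directories: the per-partition totals the test asserts on.
    System.out.println(totalLength + " " + totalFiles + " " + totalDirs);
  }
}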
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
The class TestUtilities, method testGetInputSummaryWithInputEstimator.
@Test
public void testGetInputSummaryWithInputEstimator() throws IOException, HiveException {
  final int NUM_PARTITIONS = 5;
  final int BYTES_PER_FILE = 10;
  final int NUM_OF_ROWS = 5;
  JobConf jobConf = new JobConf();
  Properties properties = new Properties();
  jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);
  properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE, InputEstimatorTestClass.class.getName());
  InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE));
  /* Write more bytes to the files than the estimation reports, to verify that the Estimator is
     actually used and the reported size does not come from the filesystem */
  ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE * 2, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  // Current getInputSummary() returns -1 for each file found
  assertEquals(NUM_PARTITIONS * -1, summary.getFileCount());
  // Current getInputSummary() returns -1 for each file found
  assertEquals(NUM_PARTITIONS * -1, summary.getDirectoryCount());
  // Test deprecated mapred.dfsclient.parallelism.max
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
  jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);
  properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE, InputEstimatorTestClass.class.getName());
  InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE));
  /* Write more bytes to the files than the estimation reports, to verify that the Estimator is
     actually used and the reported size does not come from the filesystem */
  summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE * 2, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  // Current getInputSummary() returns -1 for each file found
  assertEquals(NUM_PARTITIONS * -1, summary.getFileCount());
  // Current getInputSummary() returns -1 for each file found
  assertEquals(NUM_PARTITIONS * -1, summary.getDirectoryCount());
}
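InputEstimatorTestClass and runTestGetInputSummary are test fixtures that are not shown here. As a hedged sketch of the path being exercised: when a partition's storage handler implements InputEstimator, getInputSummary is expected to take the length from the estimation rather than from the files on disk and to record -1 for the file and directory counts, roughly as below (the helper method is hypothetical, not Hive code).
// Hypothetical helper, not Hive code: build a per-partition summary from an estimate.
static ContentSummary summaryFromEstimate(long estimatedLength) {
  // length comes from the estimator; -1 stands in for the unknown file and directory counts
  return new ContentSummary(estimatedLength, -1, -1);
}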
Use of org.apache.hadoop.fs.ContentSummary in project hadoop by apache.
The class TestChRootedFileSystem, method testGetContentSummary.
@Test
public void testGetContentSummary() throws IOException {
  // getContentSummary of a dir
  fSys.mkdirs(new Path("/newDir/dirFoo"));
  ContentSummary cs = fSys.getContentSummary(new Path("/newDir/dirFoo"));
  Assert.assertEquals(-1L, cs.getQuota());
  Assert.assertEquals(-1L, cs.getSpaceQuota());
}
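The -1 values mean that no namespace quota and no space quota are set on the directory. As a small hedged addition (assuming no quotas are configured on the test filesystem), the quota fields can be read next to the usage fields:
ContentSummary cs2 = fSys.getContentSummary(new Path("/newDir/dirFoo"));
long nameQuota = cs2.getQuota();         // -1: no limit on the number of names under the path
long spaceQuota = cs2.getSpaceQuota();   // -1: no limit on raw disk space
long consumed = cs2.getSpaceConsumed();  // disk space used (includes replication on HDFS)
long dirs = cs2.getDirectoryCount();     // 1: the empty directory counts itself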
Use of org.apache.hadoop.fs.ContentSummary in project hadoop by apache.
The class BaseTestHttpFSWith, method testContentSummary.
private void testContentSummary() throws Exception {
  FileSystem fs = FileSystem.get(getProxiedFSConf());
  Path path = new Path(getProxiedFSTestDir(), "foo.txt");
  OutputStream os = fs.create(path);
  os.write(1);
  os.close();
  ContentSummary hdfsContentSummary = fs.getContentSummary(path);
  fs.close();
  fs = getHttpFSFileSystem();
  ContentSummary httpContentSummary = fs.getContentSummary(path);
  fs.close();
  assertEquals(httpContentSummary.getDirectoryCount(), hdfsContentSummary.getDirectoryCount());
  assertEquals(httpContentSummary.getFileCount(), hdfsContentSummary.getFileCount());
  assertEquals(httpContentSummary.getLength(), hdfsContentSummary.getLength());
  assertEquals(httpContentSummary.getQuota(), hdfsContentSummary.getQuota());
  assertEquals(httpContentSummary.getSpaceConsumed(), hdfsContentSummary.getSpaceConsumed());
  assertEquals(httpContentSummary.getSpaceQuota(), hdfsContentSummary.getSpaceQuota());
}
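getHttpFSFileSystem() and the getProxiedFS* methods are helpers of the test base class and are not shown. Outside the harness, a comparable HttpFS-backed client can be created directly, since HttpFS speaks the WebHDFS protocol; the host, port and path below are placeholders, a sketch only.
Configuration conf = new Configuration();
// 14000 is the default HttpFS port; the webhdfs scheme selects the WebHDFS-compatible client.
FileSystem httpFs = FileSystem.get(URI.create("webhdfs://httpfs-host:14000"), conf);
ContentSummary viaHttp = httpFs.getContentSummary(new Path("/user/foo/foo.txt"));
System.out.println(viaHttp.getLength() + " bytes in " + viaHttp.getFileCount() + " file(s)");
httpFs.close();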