
Example 36 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

From the class MapReduceCompiler, method decideExecMode:

@Override
protected void decideExecMode(List<Task<? extends Serializable>> rootTasks, Context ctx, GlobalLimitCtx globalLimitCtx) throws SemanticException {
    // bypass for explain queries for now
    if (ctx.isExplainSkipExecution()) {
        return;
    }
    // user has told us to run in local mode or doesn't want auto-local mode
    if (ctx.isLocalOnlyExecutionMode() || !conf.getBoolVar(HiveConf.ConfVars.LOCALMODEAUTO)) {
        return;
    }
    final Context lCtx = ctx;
    PathFilter p = new PathFilter() {

        @Override
        public boolean accept(Path file) {
            return !lCtx.isMRTmpFileURI(file.toUri().getPath());
        }
    };
    List<ExecDriver> mrtasks = Utilities.getMRTasks(rootTasks);
    // map-reduce jobs will be run locally based on data size
    // first find out if any of the jobs needs to run non-locally
    boolean hasNonLocalJob = false;
    for (ExecDriver mrtask : mrtasks) {
        try {
            ContentSummary inputSummary = Utilities.getInputSummary(ctx, mrtask.getWork().getMapWork(), p);
            int numReducers = getNumberOfReducers(mrtask.getWork(), conf);
            long estimatedInput;
            if (globalLimitCtx != null && globalLimitCtx.isEnable()) {
                // If the global limit optimization is triggered, we will
                // estimate input data actually needed based on limit rows.
                // estimated Input = (num_limit * max_size_per_row) * (estimated_map + 2)
                //
                long sizePerRow = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
                estimatedInput = (globalLimitCtx.getGlobalOffset() + globalLimitCtx.getGlobalLimit()) * sizePerRow;
                long minSplitSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE);
                long estimatedNumMap = inputSummary.getLength() / minSplitSize + 1;
                estimatedInput = estimatedInput * (estimatedNumMap + 1);
            } else {
                estimatedInput = inputSummary.getLength();
            }
            if (LOG.isDebugEnabled()) {
                LOG.debug("Task: " + mrtask.getId() + ", Summary: " + inputSummary.getLength() + "," + inputSummary.getFileCount() + "," + numReducers + ", estimated Input: " + estimatedInput);
            }
            if (MapRedTask.isEligibleForLocalMode(conf, numReducers, estimatedInput, inputSummary.getFileCount()) != null) {
                hasNonLocalJob = true;
                break;
            } else {
                mrtask.setLocalMode(true);
            }
        } catch (IOException e) {
            throw new SemanticException(e);
        }
    }
    if (!hasNonLocalJob) {
        // Entire query can be run locally.
        // Save the current tracker value and restore it when done.
        ctx.setOriginalTracker(ShimLoader.getHadoopShims().getJobLauncherRpcAddress(conf));
        ShimLoader.getHadoopShims().setJobLauncherRpcAddress(conf, "local");
        console.printInfo("Automatically selecting local only mode for query");
    }
}
Also used : Context(org.apache.hadoop.hive.ql.Context) PhysicalContext(org.apache.hadoop.hive.ql.optimizer.physical.PhysicalContext) GenMRProcContext(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext) Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) ContentSummary(org.apache.hadoop.fs.ContentSummary) ExecDriver(org.apache.hadoop.hive.ql.exec.mr.ExecDriver) IOException(java.io.IOException)
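The decision above keys off two ContentSummary fields, getLength() and getFileCount(). As a minimal, hedged sketch of the same idea outside Hive (the class name and threshold constants below are illustrative; Hive reads its real limits from the hive.exec.mode.local.auto.* settings), a directory's aggregate size can be checked against a local-mode cutoff like this:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative only: not part of Hive; thresholds are made up for this sketch.
public class LocalModeSizeCheck {

    private static final long MAX_LOCAL_INPUT_BYTES = 128L * 1024 * 1024;
    private static final long MAX_LOCAL_INPUT_FILES = 4;

    static boolean smallEnoughForLocalMode(Configuration conf, Path inputDir) throws IOException {
        FileSystem fs = inputDir.getFileSystem(conf);
        // getContentSummary() walks the subtree and aggregates length and file/dir counts.
        ContentSummary summary = fs.getContentSummary(inputDir);
        return summary.getLength() <= MAX_LOCAL_INPUT_BYTES
                && summary.getFileCount() <= MAX_LOCAL_INPUT_FILES;
    }

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        System.out.println(smallEnoughForLocalMode(conf, new Path(args[0])));
    }
}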

Example 37 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

From the class TestUtilities, method testGetInputSummaryWithASingleThread:

@Test
public void testGetInputSummaryWithASingleThread() throws IOException {
    final int NUM_PARTITIONS = 5;
    final int BYTES_PER_FILE = 5;
    JobConf jobConf = new JobConf();
    Properties properties = new Properties();
    jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
    ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE, HiveInputFormat.class);
    assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
    assertEquals(NUM_PARTITIONS, summary.getFileCount());
    assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
}
Also used : ContentSummary(org.apache.hadoop.fs.ContentSummary) Properties(java.util.Properties) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.Test)
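runTestGetInputSummary is a private helper of TestUtilities and is not shown in this listing. As a rough sketch of the same assertions against a plain local filesystem (the temp path and directory names below are made up, and this bypasses Hive's getInputSummary and its threading entirely), one could write:

import static org.junit.Assert.assertEquals;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Test;

public class ContentSummarySketchTest {

    @Test
    public void summarizesPartitionLikeLayout() throws IOException {
        final int NUM_PARTITIONS = 5;
        final int BYTES_PER_FILE = 5;
        FileSystem fs = FileSystem.getLocal(new Configuration());
        Path base = new Path(System.getProperty("java.io.tmpdir"), "content-summary-sketch");
        fs.delete(base, true);
        for (int i = 0; i < NUM_PARTITIONS; i++) {
            // One file of BYTES_PER_FILE bytes per partition-style directory.
            FSDataOutputStream out = fs.create(new Path(base, "p=" + i + "/data"));
            out.write(new byte[BYTES_PER_FILE]);
            out.close();
        }
        ContentSummary summary = fs.getContentSummary(base);
        assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
        assertEquals(NUM_PARTITIONS, summary.getFileCount());
        // Unlike the Hive helper above, FileSystem.getContentSummary counts the base directory too.
        assertEquals(NUM_PARTITIONS + 1, summary.getDirectoryCount());
        fs.delete(base, true);
    }
}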

Example 38 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

From the class TestUtilities, method testGetInputSummaryWithInputEstimator:

@Test
public void testGetInputSummaryWithInputEstimator() throws IOException, HiveException {
    final int NUM_PARTITIONS = 5;
    final int BYTES_PER_FILE = 10;
    final int NUM_OF_ROWS = 5;
    JobConf jobConf = new JobConf();
    Properties properties = new Properties();
    jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2);
    properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE, InputEstimatorTestClass.class.getName());
    InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE));
    /* Write twice as many bytes to each file to verify the Estimator is actually used: the reported length should come from the estimation, not from the filesystem */
    ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE * 2, HiveInputFormat.class);
    assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
    // Current getInputSummary() returns -1 for each file found
    assertEquals(NUM_PARTITIONS * -1, summary.getFileCount());
    // Current getInputSummary() returns -1 for each directory found
    assertEquals(NUM_PARTITIONS * -1, summary.getDirectoryCount());
    // Test deprecated mapred.dfsclient.parallelism.max
    jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
    jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);
    properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE, InputEstimatorTestClass.class.getName());
    InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE));
    /* Write twice as many bytes to each file to verify the Estimator is actually used: the reported length should come from the estimation, not from the filesystem */
    summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE * 2, HiveInputFormat.class);
    assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
    // Current getInputSummary() returns -1 for each file found
    assertEquals(NUM_PARTITIONS * -1, summary.getFileCount());
    // Current getInputSummary() returns -1 for each directory found
    assertEquals(NUM_PARTITIONS * -1, summary.getDirectoryCount());
}
Also used : InputEstimator(org.apache.hadoop.hive.ql.metadata.InputEstimator) ContentSummary(org.apache.hadoop.fs.ContentSummary) Properties(java.util.Properties) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.Test)
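The -1 assertions above are easy to misread, so here is the expected arithmetic spelled out as a plain, hedged sketch (no Hive APIs involved; it only restates what the comments in the test say about the estimator path):

public class EstimatorExpectations {
    public static void main(String[] args) {
        final int NUM_PARTITIONS = 5;
        final int BYTES_PER_FILE = 10;
        // The length comes from the Estimation (BYTES_PER_FILE per partition),
        // not from the files on disk, which were written with BYTES_PER_FILE * 2 bytes each.
        System.out.println("expected length:    " + NUM_PARTITIONS * BYTES_PER_FILE);  // 50
        // On the estimator path, getInputSummary() adds -1 per partition to both counts.
        System.out.println("expected fileCount: " + NUM_PARTITIONS * -1);              // -5
        System.out.println("expected dirCount:  " + NUM_PARTITIONS * -1);              // -5
    }
}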

Example 39 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hadoop by apache.

From the class TestChRootedFileSystem, method testGetContentSummary:

@Test
public void testGetContentSummary() throws IOException {
    // GetContentSummary of a dir
    fSys.mkdirs(new Path("/newDir/dirFoo"));
    ContentSummary cs = fSys.getContentSummary(new Path("/newDir/dirFoo"));
    Assert.assertEquals(-1L, cs.getQuota());
    Assert.assertEquals(-1L, cs.getSpaceQuota());
}
Also used : Path(org.apache.hadoop.fs.Path) ContentSummary(org.apache.hadoop.fs.ContentSummary) Test(org.junit.Test)
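The two -1 values are ContentSummary's convention for "no quota set". A small hedged sketch showing the same convention against a local filesystem (the temp path is illustrative, and this does not go through ViewFs/ChRootedFileSystem at all):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class QuotaDefaultsSketch {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.getLocal(new Configuration());
        Path base = new Path(System.getProperty("java.io.tmpdir"), "quota-sketch");
        Path dir = new Path(base, "dirFoo");
        fs.mkdirs(dir);
        ContentSummary cs = fs.getContentSummary(dir);
        // Filesystems that do not enforce quotas report -1, meaning "no quota set".
        System.out.println("quota=" + cs.getQuota() + " spaceQuota=" + cs.getSpaceQuota());
        fs.delete(base, true);
    }
}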

Example 40 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hadoop by apache.

From the class BaseTestHttpFSWith, method testContentSummary:

private void testContentSummary() throws Exception {
    FileSystem fs = FileSystem.get(getProxiedFSConf());
    Path path = new Path(getProxiedFSTestDir(), "foo.txt");
    OutputStream os = fs.create(path);
    os.write(1);
    os.close();
    ContentSummary hdfsContentSummary = fs.getContentSummary(path);
    fs.close();
    fs = getHttpFSFileSystem();
    ContentSummary httpContentSummary = fs.getContentSummary(path);
    fs.close();
    assertEquals(httpContentSummary.getDirectoryCount(), hdfsContentSummary.getDirectoryCount());
    assertEquals(httpContentSummary.getFileCount(), hdfsContentSummary.getFileCount());
    assertEquals(httpContentSummary.getLength(), hdfsContentSummary.getLength());
    assertEquals(httpContentSummary.getQuota(), hdfsContentSummary.getQuota());
    assertEquals(httpContentSummary.getSpaceConsumed(), hdfsContentSummary.getSpaceConsumed());
    assertEquals(httpContentSummary.getSpaceQuota(), hdfsContentSummary.getSpaceQuota());
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) ContentSummary(org.apache.hadoop.fs.ContentSummary)
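The six assertEquals calls compare every ContentSummary field the test cares about. Pulling them into a small hypothetical helper (not part of BaseTestHttpFSWith, just a sketch using the same getters shown above) keeps the comparison in one place:

import static org.junit.Assert.assertEquals;

import org.apache.hadoop.fs.ContentSummary;

public final class ContentSummaryAssert {

    private ContentSummaryAssert() {
    }

    /** Asserts that two summaries agree on all fields compared in the test above. */
    public static void assertSameSummary(ContentSummary expected, ContentSummary actual) {
        assertEquals(expected.getDirectoryCount(), actual.getDirectoryCount());
        assertEquals(expected.getFileCount(), actual.getFileCount());
        assertEquals(expected.getLength(), actual.getLength());
        assertEquals(expected.getQuota(), actual.getQuota());
        assertEquals(expected.getSpaceConsumed(), actual.getSpaceConsumed());
        assertEquals(expected.getSpaceQuota(), actual.getSpaceQuota());
    }
}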

Aggregations

ContentSummary (org.apache.hadoop.fs.ContentSummary): 61
Path (org.apache.hadoop.fs.Path): 42
Test (org.junit.Test): 38
FileSystem (org.apache.hadoop.fs.FileSystem): 10
IOException (java.io.IOException): 9
Configuration (org.apache.hadoop.conf.Configuration): 8
ArrayList (java.util.ArrayList): 6
OutputStream (java.io.OutputStream): 5
URI (java.net.URI): 5
DSQuotaExceededException (org.apache.hadoop.hdfs.protocol.DSQuotaExceededException): 5
QuotaExceededException (org.apache.hadoop.hdfs.protocol.QuotaExceededException): 5
WebHdfsFileSystem (org.apache.hadoop.hdfs.web.WebHdfsFileSystem): 5
JobConf (org.apache.hadoop.mapred.JobConf): 5
HttpURLConnection (java.net.HttpURLConnection): 4
HashMap (java.util.HashMap): 4
Properties (java.util.Properties): 4
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 4
NSQuotaExceededException (org.apache.hadoop.hdfs.protocol.NSQuotaExceededException): 4
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 4
FileNotFoundException (java.io.FileNotFoundException): 3