
Example 76 with ContentSummary

use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

the class TestGetInputSummary method testGetInputSummaryWithContentSummaryInputFormat.

@Test
public void testGetInputSummaryWithContentSummaryInputFormat() throws IOException {
    final int BYTES_PER_FILE = 10;
    final Collection<Path> testPaths = Arrays.asList(new Path("p1/test.txt"), new Path("p2/test.txt"), new Path("p3/test.txt"), new Path("p4/test.txt"), new Path("p5/test.txt"));
    jobConf.setInt(ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2);
    ContentSummaryInputFormatTestClass.setContentSummary(new ContentSummary.Builder().length(BYTES_PER_FILE).fileCount(2).directoryCount(1).build());
    /*
     * Write twice as many bytes to each file to verify that
     * ContentSummaryInputFormat is actually consulted: the summary must
     * reflect the injected values, not the file sizes on the filesystem.
     */
    ContentSummary summary = runTestGetInputSummary(jobConf, properties, testPaths, BYTES_PER_FILE * 2, ContentSummaryInputFormatTestClass.class, Collections.emptyMap());
    assertEquals(testPaths.size() * BYTES_PER_FILE, summary.getLength());
    assertEquals(testPaths.size() * 2, summary.getFileCount());
    assertEquals(testPaths.size(), summary.getDirectoryCount());
}
Also used : Path(org.apache.hadoop.fs.Path) ContentSummary(org.apache.hadoop.fs.ContentSummary) Test(org.junit.Test)
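
How this works: ContentSummaryInputFormatTestClass lets the test inject a canned summary instead of having Hive walk the filesystem. Its source is not shown here; below is a minimal sketch of such a class, assuming Hive's ContentSummaryInputFormat interface (org.apache.hadoop.hive.ql.io) and using the hypothetical name StaticSummaryInputFormat.

import java.io.IOException;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

// Hypothetical stand-in for ContentSummaryInputFormatTestClass.
public class StaticSummaryInputFormat extends TextInputFormat implements ContentSummaryInputFormat {

    // Summary injected by the test; returned for every path.
    private static ContentSummary summary;

    public static void setContentSummary(ContentSummary cs) {
        summary = cs;
    }

    @Override
    public ContentSummary getContentSummary(Path p, JobConf job) throws IOException {
        // Ignore the real size on disk and return the canned summary, which
        // is why the test can write 2 * BYTES_PER_FILE bytes per file yet
        // still see BYTES_PER_FILE per path in the aggregated result.
        return summary;
    }
}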

Example 77 with ContentSummary

use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

the class TestGetInputSummary method testGetInputSummaryWithASingleThread.

@Test
public void testGetInputSummaryWithASingleThread() throws IOException {
    final int BYTES_PER_FILE = 5;
    // Set to zero threads to disable thread pool
    jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
    final Collection<Path> testPaths = Arrays.asList(new Path("p1/test.txt"), new Path("p2/test.txt"), new Path("p3/test.txt"), new Path("p4/test.txt"), new Path("p5/test.txt"));
    ContentSummary summary = runTestGetInputSummary(jobConf, properties, testPaths, BYTES_PER_FILE, HiveInputFormat.class, Collections.emptyMap());
    assertEquals(testPaths.size() * BYTES_PER_FILE, summary.getLength());
    assertEquals(testPaths.size(), summary.getFileCount());
    assertEquals(testPaths.size(), summary.getDirectoryCount());
}
Also used : Path(org.apache.hadoop.fs.Path) ContentSummary(org.apache.hadoop.fs.ContentSummary) Test(org.junit.Test)
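
With the default HiveInputFormat, the summary reflects the real filesystem state. The following self-contained sketch exercises the same three accessors against a local FileSystem, outside the Hive test harness (the directory name and byte count are illustrative).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ContentSummaryDemo {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.getLocal(new Configuration());
        Path dir = new Path(System.getProperty("java.io.tmpdir"), "cs-demo");
        fs.mkdirs(dir);
        // One 5-byte file, mirroring BYTES_PER_FILE in the test above.
        try (FSDataOutputStream out = fs.create(new Path(dir, "test.txt"))) {
            out.write(new byte[5]);
        }
        ContentSummary cs = fs.getContentSummary(dir);
        // Expected: length=5 files=1 dirs=1
        System.out.printf("length=%d files=%d dirs=%d%n",
                cs.getLength(), cs.getFileCount(), cs.getDirectoryCount());
    }
}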

Example 78 with ContentSummary

use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

the class CopyUtils method regularCopy.

/*
      Check for conditions that lead to a local copy. A local copy is used when:
      1. Hive is running in replication test mode.
      2. Either the source or the destination is a local FileSystem ("file").
      3. The aggregate size of all source paths (each may be a directory or a file) is below the configured limit.
      4. The number of files across all source paths is below the configured limit.
  */
boolean regularCopy(FileSystem sourceFs, List<ReplChangeManager.FileInfo> fileList) throws IOException {
    if (hiveInReplTest) {
        return true;
    }
    if (isLocal(sourceFs) || isLocal(destinationFs)) {
        return true;
    }
    /*
       At this point the copy crosses filesystems, so decide based on the
       aggregate size and file count of the sources.
    */
    long size = 0;
    long numberOfFiles = 0;
    for (ReplChangeManager.FileInfo fileInfo : fileList) {
        ContentSummary contentSummary = null;
        try {
            contentSummary = getContentSummary(sourceFs, fileInfo.getEffectivePath());
        } catch (IOException e) {
            // In replication, if source file does not exist, try cmroot
            if (fileInfo.isUseSourcePath() && fileInfo.getCmPath() != null) {
                contentSummary = getContentSummary(sourceFs, fileInfo.getCmPath());
                fileInfo.setIsUseSourcePath(false);
            }
        }
        if (contentSummary != null) {
            size += contentSummary.getLength();
            numberOfFiles += contentSummary.getFileCount();
            if (limitReachedForLocalCopy(size, numberOfFiles)) {
                return false;
            }
        }
    }
    return true;
}
Also used : ContentSummary(org.apache.hadoop.fs.ContentSummary) IOException(java.io.IOException) ReplChangeManager(org.apache.hadoop.hive.metastore.ReplChangeManager)
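
The threshold check itself is not shown above. Below is a hypothetical sketch of limitReachedForLocalCopy with illustrative constants standing in for the HiveConf-derived limits; the real CopyUtils reads its thresholds from configuration, and how the two limits combine is an implementation detail not reproduced here.

// Hypothetical sketch; constants and combination logic are illustrative only.
class LocalCopyLimits {

    static final long MAX_LOCAL_COPY_SIZE = 32L * 1024 * 1024; // illustrative: 32 MB
    static final long MAX_LOCAL_COPY_FILES = 1000;             // illustrative

    static boolean limitReachedForLocalCopy(long size, long numberOfFiles) {
        // Once a running total crosses its threshold, regularCopy() returns
        // false and the caller falls back to a distributed copy (distcp).
        return size > MAX_LOCAL_COPY_SIZE || numberOfFiles > MAX_LOCAL_COPY_FILES;
    }
}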

Example 79 with ContentSummary

use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

the class QueryResultsCache method calculateEntrySize.

private void calculateEntrySize(CacheEntry entry, FetchWork fetchWork) throws IOException {
    Path queryResultsPath = fetchWork.getTblDir();
    FileSystem resultsFs = queryResultsPath.getFileSystem(conf);
    ContentSummary cs = resultsFs.getContentSummary(queryResultsPath);
    entry.size = cs.getLength();
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) ContentSummary(org.apache.hadoop.fs.ContentSummary)
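
Once the entry size is known, a cache typically gates admission on it. A hypothetical sketch of such a check follows; the field and method names are illustrative, and the real QueryResultsCache policy is more involved.

// Hypothetical admission check built on the size computed above.
private long maxEntrySize;      // largest single result set the cache admits
private long maxCacheSize;      // total byte budget across all entries
private long currentCacheSize;  // bytes currently held

private boolean shouldEntryBeAdded(CacheEntry entry) {
    // Reject results that are individually too large or that would
    // exceed the overall cache budget.
    return entry.size <= maxEntrySize
            && currentCacheSize + entry.size <= maxCacheSize;
}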

Example 80 with ContentSummary

use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

the class MapReduceCompiler method decideExecMode.

@Override
protected void decideExecMode(List<Task<?>> rootTasks, Context ctx, GlobalLimitCtx globalLimitCtx) throws SemanticException {
    // bypass for explain queries for now
    if (ctx.isExplainSkipExecution()) {
        return;
    }
    // user has told us to run in local mode or doesn't want auto-local mode
    if (ctx.isLocalOnlyExecutionMode() || !conf.getBoolVar(HiveConf.ConfVars.LOCALMODEAUTO)) {
        return;
    }
    final Context lCtx = ctx;
    PathFilter p = new PathFilter() {

        @Override
        public boolean accept(Path file) {
            return !lCtx.isMRTmpFileURI(file.toUri().getPath());
        }
    };
    List<ExecDriver> mrtasks = Utilities.getMRTasks(rootTasks);
    // Map-reduce jobs may be run locally depending on data size;
    // first find out whether any of the jobs needs to run non-locally
    boolean hasNonLocalJob = false;
    for (ExecDriver mrtask : mrtasks) {
        try {
            ContentSummary inputSummary = Utilities.getInputSummary(ctx, mrtask.getWork().getMapWork(), p);
            int numReducers = getNumberOfReducers(mrtask.getWork(), conf);
            long estimatedInput;
            if (globalLimitCtx != null && globalLimitCtx.isEnable()) {
                // If the global limit optimization is triggered, estimate the
                // input data actually needed from the limit rows:
                // estimatedInput = (num_limit * max_size_per_row) * (estimated_map + 2)
                long sizePerRow = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
                estimatedInput = (globalLimitCtx.getGlobalOffset() + globalLimitCtx.getGlobalLimit()) * sizePerRow;
                long minSplitSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE);
                long estimatedNumMap = inputSummary.getLength() / minSplitSize + 1;
                estimatedInput = estimatedInput * (estimatedNumMap + 1);
            } else {
                estimatedInput = inputSummary.getLength();
            }
            if (LOG.isDebugEnabled()) {
                LOG.debug("Task: " + mrtask.getId() + ", Summary: " + inputSummary.getLength() + "," + inputSummary.getFileCount() + "," + numReducers + ", estimated Input: " + estimatedInput);
            }
            if (MapRedTask.isEligibleForLocalMode(conf, numReducers, estimatedInput, inputSummary.getFileCount()) != null) {
                hasNonLocalJob = true;
                break;
            } else {
                mrtask.setLocalMode(true);
            }
        } catch (IOException e) {
            throw new SemanticException(e);
        }
    }
    if (!hasNonLocalJob) {
        // Entire query can be run locally.
        // Save the current tracker value and restore it when done.
        ctx.setOriginalTracker(ShimLoader.getHadoopShims().getJobLauncherRpcAddress(conf));
        ShimLoader.getHadoopShims().setJobLauncherRpcAddress(conf, "local");
        console.printInfo("Automatically selecting local only mode for query");
    }
}
Also used : Context(org.apache.hadoop.hive.ql.Context) PhysicalContext(org.apache.hadoop.hive.ql.optimizer.physical.PhysicalContext) GenMRProcContext(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext) Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) ContentSummary(org.apache.hadoop.fs.ContentSummary) ExecDriver(org.apache.hadoop.hive.ql.exec.mr.ExecDriver) IOException(java.io.IOException)
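
To make the limit-based estimate concrete, here is a small worked example with made-up numbers; none of these values are Hive defaults.

public class LocalModeEstimateDemo {
    public static void main(String[] args) {
        // Illustrative inputs only.
        long globalOffset = 0;
        long globalLimit = 100;                  // e.g. LIMIT 100
        long sizePerRow = 10_000;                // max bytes per row
        long inputLength = 512L * 1024 * 1024;   // 512 MB of input
        long minSplitSize = 128L * 1024 * 1024;  // 128 MB splits

        long estimatedNumMap = inputLength / minSplitSize + 1;                     // 5
        long estimatedInput =
                (globalOffset + globalLimit) * sizePerRow * (estimatedNumMap + 1); // 6,000,000

        // Roughly 6 MB of estimated input, so with typical local-mode byte
        // thresholds this job would be a candidate for local execution.
        System.out.println("estimatedNumMap = " + estimatedNumMap);
        System.out.println("estimatedInput  = " + estimatedInput);
    }
}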

Aggregations

ContentSummary (org.apache.hadoop.fs.ContentSummary): 84
Path (org.apache.hadoop.fs.Path): 60
Test (org.junit.Test): 52
FileSystem (org.apache.hadoop.fs.FileSystem): 21
IOException (java.io.IOException): 13
Configuration (org.apache.hadoop.conf.Configuration): 9
ArrayList (java.util.ArrayList): 6
URI (java.net.URI): 5
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 5
DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem): 5
DSQuotaExceededException (org.apache.hadoop.hdfs.protocol.DSQuotaExceededException): 5
QuotaExceededException (org.apache.hadoop.hdfs.protocol.QuotaExceededException): 5
WebHdfsFileSystem (org.apache.hadoop.hdfs.web.WebHdfsFileSystem): 5
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 5
JobConf (org.apache.hadoop.mapred.JobConf): 5
UserGroupInformation (org.apache.hadoop.security.UserGroupInformation): 5
OutputStream (java.io.OutputStream): 4
HttpURLConnection (java.net.HttpURLConnection): 4
List (java.util.List): 4
Properties (java.util.Properties): 4