Example 26 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

From the class TestSymlinkTextInputFormat, method testAccuracy2.

/**
   * Scenario: Empty input directory, i.e. no symlink file.
   *
   * Expected: Should return empty result set without any exception.
   */
public void testAccuracy2() throws IOException {
    fileSystem.mkdirs(symlinkDir);
    FileInputFormat.setInputPaths(job, symlinkDir);
    SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();
    ContentSummary cs = inputFormat.getContentSummary(symlinkDir, job);
    assertEquals(0, cs.getLength());
    assertEquals(0, cs.getFileCount());
    assertEquals(0, cs.getDirectoryCount());
    InputSplit[] splits = inputFormat.getSplits(job, 2);
    log.info("Number of splits: " + splits.length);
    // Read all values.
    List<String> received = new ArrayList<String>();
    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, job, reporter);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        while (reader.next(key, value)) {
            received.add(value.toString());
        }
        reader.close();
    }
    List<String> expected = new ArrayList<String>();
    assertEquals(expected, received);
}
Also used : ContentSummary(org.apache.hadoop.fs.ContentSummary) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapred.InputSplit)
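
The test above only covers the empty-directory case. For contrast, below is a minimal, hypothetical sketch of the populated case: a symlink file in this format is just a text file whose lines are paths to the real data files, and getContentSummary is expected to reflect the size of those targets. The directory layout, file names, and the wrapper class are illustrative assumptions, not part of the Hive test suite.

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class SymlinkSummarySketch {

    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        FileSystem fs = FileSystem.getLocal(job);

        // Hypothetical layout: one real data file plus a symlink directory
        // whose single symlink file points at that data file.
        Path dataDir = new Path("/tmp/symlink-sketch/data");
        Path symlinkDir = new Path("/tmp/symlink-sketch/symlinks");
        fs.mkdirs(dataDir);
        fs.mkdirs(symlinkDir);

        Path dataFile = new Path(dataDir, "part-00000");
        try (FSDataOutputStream out = fs.create(dataFile, true)) {
            out.write("hello\nworld\n".getBytes(StandardCharsets.UTF_8));
        }

        // A symlink file is a plain text file listing target paths, one per line.
        Path symlinkFile = new Path(symlinkDir, "symlink.txt");
        try (FSDataOutputStream out = fs.create(symlinkFile, true)) {
            out.write((dataFile.toString() + "\n").getBytes(StandardCharsets.UTF_8));
        }

        // getContentSummary resolves the symlink entries and sums up the targets.
        ContentSummary cs = new SymlinkTextInputFormat().getContentSummary(symlinkDir, job);
        System.out.println("length=" + cs.getLength()
            + " files=" + cs.getFileCount()
            + " dirs=" + cs.getDirectoryCount());
    }
}

The printed length should match the bytes written to the target file; the exact file and directory counts depend on how the targets are summarized.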

Example 27 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

From the class TestUtilities, method testGetInputSummaryWithContentSummaryInputFormat.

@Test
public void testGetInputSummaryWithContentSummaryInputFormat() throws IOException {
    final int NUM_PARTITIONS = 5;
    final int BYTES_PER_FILE = 10;
    JobConf jobConf = new JobConf();
    Properties properties = new Properties();
    jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);
    ContentSummaryInputFormatTestClass.setContentSummary(new ContentSummary.Builder().length(BYTES_PER_FILE).fileCount(2).directoryCount(1).build());
    /* Write more bytes to each file than the canned summary reports, to verify that ContentSummaryInputFormat is actually used for the size instead of the filesystem. */
    ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE * 2, ContentSummaryInputFormatTestClass.class);
    assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
    assertEquals(NUM_PARTITIONS * 2, summary.getFileCount());
    assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
}
Also used : ContentSummary(org.apache.hadoop.fs.ContentSummary) Properties(java.util.Properties) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.Test)
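
ContentSummaryInputFormatTestClass, referenced above, is a helper defined inside TestUtilities whose only job is to hand back a canned ContentSummary instead of consulting the filesystem. Below is a hedged, stand-alone sketch of what such a test input format might look like; the class name FixedSummaryInputFormat and its internals are assumptions used purely to illustrate the org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat contract.

import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

/**
 * Sketch of a test-only input format that reports a fixed ContentSummary.
 */
public class FixedSummaryInputFormat extends TextInputFormat
        implements ContentSummaryInputFormat {

    // Statically configured summary, mirroring the setContentSummary() call
    // in the test above.
    private static ContentSummary summary =
        new ContentSummary.Builder().length(0).fileCount(0).directoryCount(0).build();

    public static void setContentSummary(ContentSummary cs) {
        summary = cs;
    }

    @Override
    public ContentSummary getContentSummary(Path p, JobConf job) {
        // Ignore the path and the filesystem entirely; whatever was configured
        // via setContentSummary() is returned for every partition directory.
        return summary;
    }
}

Because Utilities.getInputSummary (Example 29) checks instanceof ContentSummaryInputFormat before falling back to FileSystem.getContentSummary, the test can write BYTES_PER_FILE * 2 bytes per file on disk and still expect NUM_PARTITIONS * BYTES_PER_FILE from the summary.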

Example 28 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

From the class TestUtilities, method testGetInputSummaryWithMultipleThreads.

@Test
public void testGetInputSummaryWithMultipleThreads() throws IOException {
    final int NUM_PARTITIONS = 5;
    final int BYTES_PER_FILE = 5;
    JobConf jobConf = new JobConf();
    Properties properties = new Properties();
    jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2);
    ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE, HiveInputFormat.class);
    assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
    assertEquals(NUM_PARTITIONS, summary.getFileCount());
    assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
    // Test deprecated mapred.dfsclient.parallelism.max
    jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
    jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);
    summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE, HiveInputFormat.class);
    assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
    assertEquals(NUM_PARTITIONS, summary.getFileCount());
    assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
}
Also used : ContentSummary(org.apache.hadoop.fs.ContentSummary) Properties(java.util.Properties) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.Test)
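
The two halves of this test differ only in how the thread count is configured, which is decided by Utilities.getMaxExecutorsForInputListing. The sketch below paraphrases the behaviour the test exercises; the helper class, the exact clamping, and the fallback order are assumptions, not the literal Hive implementation.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;

public final class InputListingThreads {

    // Same key as Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX.
    private static final String DEPRECATED_KEY = "mapred.dfsclient.parallelism.max";

    /**
     * Prefer HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS; fall back to
     * the deprecated mapred.dfsclient.parallelism.max when it is unset or zero,
     * and never use more threads than there are input paths.
     */
    static int maxExecutors(Configuration conf, int inputPathCount) {
        int maxThreads = HiveConf.getIntVar(conf,
            HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS);
        if (maxThreads <= 0) {
            maxThreads = conf.getInt(DEPRECATED_KEY, 0);
        }
        // A thread per path is the most that can ever be useful.
        return Math.min(Math.max(maxThreads, 1), Math.max(inputPathCount, 1));
    }

    private InputListingThreads() {
    }
}

In getInputSummary (Example 29) a thread pool is only created when this value is greater than one; otherwise the summaries are computed inline on the calling thread.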

Example 29 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

From the class Utilities, method getInputSummary.

/**
   * Calculate the total size of input files.
   *
   * @param ctx
   *          the hadoop job context
   * @param work
   *          map reduce job plan
   * @param filter
   *          filter to apply to the input paths before calculating size
   * @return the summary of all the input paths.
   * @throws IOException if a failure occurs while waiting for the content summary tasks
   */
public static ContentSummary getInputSummary(final Context ctx, MapWork work, PathFilter filter) throws IOException {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
    long[] summary = { 0, 0, 0 };
    final Set<Path> pathNeedProcess = new HashSet<>();
    // This synchronized block keeps the number of summary threads from growing out of control.
    synchronized (INPUT_SUMMARY_LOCK) {
        // For each input path, calculate the total size.
        for (Path path : work.getPathToAliases().keySet()) {
            Path p = path;
            if (filter != null && !filter.accept(p)) {
                continue;
            }
            ContentSummary cs = ctx.getCS(path);
            if (cs == null) {
                if (path == null) {
                    continue;
                }
                pathNeedProcess.add(path);
            } else {
                summary[0] += cs.getLength();
                summary[1] += cs.getFileCount();
                summary[2] += cs.getDirectoryCount();
            }
        }
        // Process the case when name node call is needed
        final Map<String, ContentSummary> resultMap = new ConcurrentHashMap<String, ContentSummary>();
        ArrayList<Future<?>> results = new ArrayList<Future<?>>();
        final ExecutorService executor;
        int numExecutors = getMaxExecutorsForInputListing(ctx.getConf(), pathNeedProcess.size());
        if (numExecutors > 1) {
            LOG.info("Using " + numExecutors + " threads for getContentSummary");
            executor = Executors.newFixedThreadPool(numExecutors, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Input-Summary-%d").build());
        } else {
            executor = null;
        }
        HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {

            @Override
            public void interrupt() {
                for (Path path : pathNeedProcess) {
                    try {
                        path.getFileSystem(ctx.getConf()).close();
                    } catch (IOException ignore) {
                        LOG.debug("Failed to close filesystem", ignore);
                    }
                }
                if (executor != null) {
                    executor.shutdownNow();
                }
            }
        });
        try {
            Configuration conf = ctx.getConf();
            JobConf jobConf = new JobConf(conf);
            for (Path path : pathNeedProcess) {
                final Path p = path;
                final String pathStr = path.toString();
                // All threads share the same Configuration and JobConf on the
                // assumption that they are thread safe as long as only read
                // operations are performed. Hadoop's javadoc does not state this,
                // but the source code clearly shows an effort to make it so, and
                // we believe the assumption holds. Revisit this code if the
                // assumption turns out to be incorrect.
                final Configuration myConf = conf;
                final JobConf myJobConf = jobConf;
                final Map<String, Operator<?>> aliasToWork = work.getAliasToWork();
                final Map<Path, ArrayList<String>> pathToAlias = work.getPathToAliases();
                final PartitionDesc partDesc = work.getPathToPartitionInfo().get(p);
                Runnable r = new Runnable() {

                    @Override
                    public void run() {
                        try {
                            Class<? extends InputFormat> inputFormatCls = partDesc.getInputFileFormatClass();
                            InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(inputFormatCls, myJobConf);
                            if (inputFormatObj instanceof ContentSummaryInputFormat) {
                                ContentSummaryInputFormat cs = (ContentSummaryInputFormat) inputFormatObj;
                                resultMap.put(pathStr, cs.getContentSummary(p, myJobConf));
                                return;
                            }
                            String metaTableStorage = null;
                            if (partDesc.getTableDesc() != null && partDesc.getTableDesc().getProperties() != null) {
                                metaTableStorage = partDesc.getTableDesc().getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, null);
                            }
                            if (partDesc.getProperties() != null) {
                                metaTableStorage = partDesc.getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, metaTableStorage);
                            }
                            HiveStorageHandler handler = HiveUtils.getStorageHandler(myConf, metaTableStorage);
                            if (handler instanceof InputEstimator) {
                                long total = 0;
                                TableDesc tableDesc = partDesc.getTableDesc();
                                InputEstimator estimator = (InputEstimator) handler;
                                for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAlias, p)) {
                                    JobConf jobConf = new JobConf(myJobConf);
                                    TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
                                    Utilities.setColumnNameList(jobConf, scanOp, true);
                                    Utilities.setColumnTypeList(jobConf, scanOp, true);
                                    PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
                                    Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
                                    total += estimator.estimate(jobConf, scanOp, -1).getTotalLength();
                                }
                                resultMap.put(pathStr, new ContentSummary(total, -1, -1));
                            } else {
                                // TODO: should nullify the summary for non-native tables
                                // so they are not selected as a map-join target
                                FileSystem fs = p.getFileSystem(myConf);
                                resultMap.put(pathStr, fs.getContentSummary(p));
                            }
                        } catch (Exception e) {
                            // We safely ignore this exception for summary data.
                            // We don't update the cache, to protect it from being
                            // polluted by failed lookups. The worst case is that the
                            // IOException will be retried on the next getInputSummary()
                            // call, which is fine since IOException is not a common case.
                            LOG.info("Cannot get size of " + pathStr + ". Safely ignored.");
                        }
                    }
                };
                if (executor == null) {
                    r.run();
                } else {
                    Future<?> result = executor.submit(r);
                    results.add(result);
                }
            }
            if (executor != null) {
                for (Future<?> result : results) {
                    boolean executorDone = false;
                    do {
                        try {
                            result.get();
                            executorDone = true;
                        } catch (InterruptedException e) {
                            LOG.info("Interrupted when waiting threads: ", e);
                            Thread.currentThread().interrupt();
                            break;
                        } catch (ExecutionException e) {
                            throw new IOException(e);
                        }
                    } while (!executorDone);
                }
                executor.shutdown();
            }
            HiveInterruptUtils.checkInterrupted();
            for (Map.Entry<String, ContentSummary> entry : resultMap.entrySet()) {
                ContentSummary cs = entry.getValue();
                summary[0] += cs.getLength();
                summary[1] += cs.getFileCount();
                summary[2] += cs.getDirectoryCount();
                ctx.addCS(entry.getKey(), cs);
                LOG.info("Cache Content Summary for " + entry.getKey() + " length: " + cs.getLength() + " file count: " + cs.getFileCount() + " directory count: " + cs.getDirectoryCount());
            }
            perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
            return new ContentSummary(summary[0], summary[1], summary[2]);
        } finally {
            HiveInterruptUtils.remove(interrup);
        }
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) ArrayList(java.util.ArrayList) ContentSummaryInputFormat(org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) ExecutionException(java.util.concurrent.ExecutionException) JobConf(org.apache.hadoop.mapred.JobConf) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) InputEstimator(org.apache.hadoop.hive.ql.metadata.InputEstimator) HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) HiveInterruptCallback(org.apache.hadoop.hive.common.HiveInterruptCallback) IOException(java.io.IOException) SQLFeatureNotSupportedException(java.sql.SQLFeatureNotSupportedException) SQLTransientException(java.sql.SQLTransientException) SQLException(java.sql.SQLException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) EOFException(java.io.EOFException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) SequenceFileInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat) ReworkMapredInputFormat(org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat) ContentSummaryInputFormat(org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) OneNullRowInputFormat(org.apache.hadoop.hive.ql.io.OneNullRowInputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) ContentSummary(org.apache.hadoop.fs.ContentSummary) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap)
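
For callers, the interesting parts of this API are the PathFilter (which may be null) and the per-Context cache consulted via ctx.getCS. The snippet below is a small usage sketch; a Context and MapWork are assumed to be available from the surrounding compilation step, and the class and method names are illustrative, not part of Hive.

import java.io.IOException;

import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.MapWork;

public final class InputSummaryUsage {

    /**
     * Sum the input size of a MapWork, skipping hidden files and directories.
     */
    static ContentSummary visibleInputSize(Context ctx, MapWork work) throws IOException {
        PathFilter skipHidden = new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        };
        // Cached summaries in ctx are reused; only unknown paths hit the
        // filesystem, a ContentSummaryInputFormat, or an InputEstimator.
        return Utilities.getInputSummary(ctx, work, skipHidden);
    }

    private InputSummaryUsage() {
    }
}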

Example 30 with ContentSummary

Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.

From the class CorrelationOptimizer, method findPossibleAutoConvertedJoinOperators.

private void findPossibleAutoConvertedJoinOperators() throws SemanticException {
    // Guess whether each join may be automatically converted to a map join, based on hive.auto.convert.join.noconditionaltask.size and the known input sizes.
    for (JoinOperator joinOp : pCtx.getJoinOps()) {
        boolean isAbleToGuess = true;
        boolean mayConvert = false;
        // Get total size and individual alias's size
        long aliasTotalKnownInputSize = 0;
        Map<String, Long> aliasToSize = new HashMap<String, Long>();
        Map<Integer, Set<String>> posToAliases = new HashMap<Integer, Set<String>>();
        for (int pos = 0; pos < joinOp.getNumParent(); pos++) {
            Operator<? extends OperatorDesc> op = joinOp.getParentOperators().get(pos);
            Set<TableScanOperator> topOps = CorrelationUtilities.findTableScanOperators(op);
            if (topOps.isEmpty()) {
                isAbleToGuess = false;
                break;
            }
            Set<String> aliases = new LinkedHashSet<String>();
            for (TableScanOperator tsop : topOps) {
                Table table = tsop.getConf().getTableMetadata();
                if (table == null) {
                    // table should not be null.
                    throw new SemanticException("The table of " + tsop.getName() + " " + tsop.getIdentifier() + " is null, which is not expected.");
                }
                String alias = tsop.getConf().getAlias();
                aliases.add(alias);
                Path p = table.getPath();
                ContentSummary resultCs = null;
                try {
                    FileSystem fs = table.getPath().getFileSystem(pCtx.getConf());
                    resultCs = fs.getContentSummary(p);
                } catch (IOException e) {
                    LOG.warn("Encounter a error while querying content summary of table " + table.getCompleteName() + " from FileSystem. " + "Cannot guess if CommonJoinOperator will optimize " + joinOp.getName() + " " + joinOp.getIdentifier());
                }
                if (resultCs == null) {
                    isAbleToGuess = false;
                    break;
                }
                long size = resultCs.getLength();
                aliasTotalKnownInputSize += size;
                Long es = aliasToSize.get(alias);
                if (es == null) {
                    es = 0L;
                }
                es += size;
                aliasToSize.put(alias, es);
            }
            posToAliases.put(pos, aliases);
        }
        if (!isAbleToGuess) {
            LOG.info("Cannot guess if CommonJoinOperator will optimize " + joinOp.getName() + " " + joinOp.getIdentifier());
            continue;
        }
        JoinDesc joinDesc = joinOp.getConf();
        Byte[] order = joinDesc.getTagOrder();
        int numAliases = order.length;
        Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc.getConds());
        if (bigTableCandidates.isEmpty()) {
            continue;
        }
        long thresholdOfSmallTblSizeSum = HiveConf.getLongVar(pCtx.getConf(), HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
        for (int i = 0; i < numAliases; i++) {
            // this table cannot be big table
            if (!bigTableCandidates.contains(i)) {
                continue;
            }
            Set<String> aliases = posToAliases.get(i);
            long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
            if (!CommonJoinTaskDispatcher.cannotConvert(aliasKnownSize, aliasTotalKnownInputSize, thresholdOfSmallTblSizeSum)) {
                mayConvert = true;
            }
        }
        if (mayConvert) {
            LOG.info(joinOp.getName() + " " + joinOp.getIdentifier() + " may be converted to MapJoin by CommonJoinResolver");
            skipedJoinOperators.add(joinOp);
        }
    }
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) LinkedHashSet(java.util.LinkedHashSet) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Set(java.util.Set) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) FileSystem(org.apache.hadoop.fs.FileSystem) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) Path(org.apache.hadoop.fs.Path) Table(org.apache.hadoop.hive.ql.metadata.Table) IOException(java.io.IOException) ContentSummary(org.apache.hadoop.fs.ContentSummary) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc)
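
The conversion guess comes down to simple arithmetic: for a big-table candidate, the combined size of the remaining aliases must stay under the small-table threshold. The worked example below plugs hypothetical sizes into the same helpers the loop uses (Utilities.sumOf and CommonJoinTaskDispatcher.cannotConvert); the table names, sizes, and the 25 MB threshold are made up for illustration, with the real threshold coming from HiveConf.ConfVars.HIVESMALLTABLESFILESIZE.

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.optimizer.physical.CommonJoinTaskDispatcher;

public class MapJoinGuessExample {

    public static void main(String[] args) {
        // Hypothetical per-alias sizes as gathered from ContentSummary.getLength().
        Map<String, Long> aliasToSize = new HashMap<>();
        aliasToSize.put("orders", 10L * 1024 * 1024 * 1024);  // 10 GB fact table
        aliasToSize.put("customers", 5L * 1024 * 1024);       // 5 MB dimension table
        long total = 0;
        for (long size : aliasToSize.values()) {
            total += size;
        }

        // Hypothetical small-table threshold of 25 MB.
        long threshold = 25L * 1024 * 1024;

        // Treat "orders" as the big-table candidate: sum its known size and let
        // cannotConvert() compare the rest of the input against the threshold,
        // mirroring the optimizer loop above.
        Set<String> candidate = Collections.singleton("orders");
        long candidateSize = Utilities.sumOf(aliasToSize, candidate);
        boolean mayConvert =
            !CommonJoinTaskDispatcher.cannotConvert(candidateSize, total, threshold);
        // Expected: true, because the 5 MB dimension table fits under 25 MB.
        System.out.println("May convert to map join: " + mayConvert);
    }
}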

Aggregations

ContentSummary (org.apache.hadoop.fs.ContentSummary): 61
Path (org.apache.hadoop.fs.Path): 42
Test (org.junit.Test): 38
FileSystem (org.apache.hadoop.fs.FileSystem): 10
IOException (java.io.IOException): 9
Configuration (org.apache.hadoop.conf.Configuration): 8
ArrayList (java.util.ArrayList): 6
OutputStream (java.io.OutputStream): 5
URI (java.net.URI): 5
DSQuotaExceededException (org.apache.hadoop.hdfs.protocol.DSQuotaExceededException): 5
QuotaExceededException (org.apache.hadoop.hdfs.protocol.QuotaExceededException): 5
WebHdfsFileSystem (org.apache.hadoop.hdfs.web.WebHdfsFileSystem): 5
JobConf (org.apache.hadoop.mapred.JobConf): 5
HttpURLConnection (java.net.HttpURLConnection): 4
HashMap (java.util.HashMap): 4
Properties (java.util.Properties): 4
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 4
NSQuotaExceededException (org.apache.hadoop.hdfs.protocol.NSQuotaExceededException): 4
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 4
FileNotFoundException (java.io.FileNotFoundException): 3