Example 6 with InputFormat

Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.

The class FetchOperator, method getNextSplits.

protected FetchInputFormatSplit[] getNextSplits() throws Exception {
    while (getNextPath()) {
        // not using FileInputFormat.setInputPaths() here because it forces a connection to the
        // default file system - which may or may not be online during pure metadata operations
        job.set("mapred.input.dir", StringUtils.escapeString(currPath.toString()));
        // The fetch operator is not vectorized, so turn the vectorization flag off so that
        // a non-vectorized record reader is created below.
        HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false);
        Class<? extends InputFormat> formatter = currDesc.getInputFileFormatClass();
        Utilities.copyTableJobPropertiesToConf(currDesc.getTableDesc(), job);
        InputFormat inputFormat = getInputFormatFromCache(formatter, job);
        InputSplit[] splits = inputFormat.getSplits(job, 1);
        FetchInputFormatSplit[] inputSplits = new FetchInputFormatSplit[splits.length];
        for (int i = 0; i < splits.length; i++) {
            inputSplits[i] = new FetchInputFormatSplit(splits[i], inputFormat);
        }
        if (work.getSplitSample() != null) {
            inputSplits = splitSampling(work.getSplitSample(), inputSplits);
        }
        if (inputSplits.length > 0) {
            return inputSplits;
        }
    }
    return null;
}
Also used : InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapreduce.lib.input.FileInputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) InputSplit(org.apache.hadoop.mapred.InputSplit)
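
A minimal standalone sketch of the same pattern (not from the Hive sources): setting mapred.input.dir directly and asking a plain mapred InputFormat for its splits. The class name SplitListing and the command-line argument are illustrative, and TextInputFormat stands in for whatever format the table actually uses.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.StringUtils;

public class SplitListing {

    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        // Same trick as FetchOperator: set the input directory property directly
        // instead of calling FileInputFormat.setInputPaths(), which would touch
        // the default file system.
        Path input = new Path(args[0]);
        job.set("mapred.input.dir", StringUtils.escapeString(input.toString()));
        TextInputFormat format = new TextInputFormat();
        // TextInputFormat is JobConfigurable; configure() must run before getSplits().
        format.configure(job);
        for (InputSplit split : format.getSplits(job, 1)) {
            System.out.println(split + " (" + split.getLength() + " bytes)");
        }
    }
}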

Example 7 with InputFormat

Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.

The class Utilities, method getInputSummary.

/**
   * Calculate the total size of input files.
   *
   * @param ctx
   *          the hadoop job context
   * @param work
   *          map reduce job plan
   * @param filter
   *          filter to apply to the input paths before calculating size
   * @return the summary of all the input paths.
   * @throws IOException
   */
public static ContentSummary getInputSummary(final Context ctx, MapWork work, PathFilter filter) throws IOException {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
    long[] summary = { 0, 0, 0 };
    final Set<Path> pathNeedProcess = new HashSet<>();
    // Synchronize here so that the number of threads doesn't get out of control.
    synchronized (INPUT_SUMMARY_LOCK) {
        // For each input path, calculate the total size.
        for (Path path : work.getPathToAliases().keySet()) {
            Path p = path;
            if (filter != null && !filter.accept(p)) {
                continue;
            }
            ContentSummary cs = ctx.getCS(path);
            if (cs == null) {
                if (path == null) {
                    continue;
                }
                pathNeedProcess.add(path);
            } else {
                summary[0] += cs.getLength();
                summary[1] += cs.getFileCount();
                summary[2] += cs.getDirectoryCount();
            }
        }
        // Process the case when name node call is needed
        final Map<String, ContentSummary> resultMap = new ConcurrentHashMap<String, ContentSummary>();
        ArrayList<Future<?>> results = new ArrayList<Future<?>>();
        final ExecutorService executor;
        int numExecutors = getMaxExecutorsForInputListing(ctx.getConf(), pathNeedProcess.size());
        if (numExecutors > 1) {
            LOG.info("Using " + numExecutors + " threads for getContentSummary");
            executor = Executors.newFixedThreadPool(numExecutors, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Input-Summary-%d").build());
        } else {
            executor = null;
        }
        HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {

            @Override
            public void interrupt() {
                for (Path path : pathNeedProcess) {
                    try {
                        path.getFileSystem(ctx.getConf()).close();
                    } catch (IOException ignore) {
                        LOG.debug("Failed to close filesystem", ignore);
                    }
                }
                if (executor != null) {
                    executor.shutdownNow();
                }
            }
        });
        try {
            Configuration conf = ctx.getConf();
            JobConf jobConf = new JobConf(conf);
            for (Path path : pathNeedProcess) {
                final Path p = path;
                final String pathStr = path.toString();
                // All threads share the same Configuration and JobConf, on the
                // assumption that they are thread safe as long as only read
                // operations are performed. This is not stated in Hadoop's javadoc,
                // but the source code clearly shows an effort to make them so, and
                // we believe the assumption holds. Revisit this code if the
                // assumption turns out to be incorrect.
                final Configuration myConf = conf;
                final JobConf myJobConf = jobConf;
                final Map<String, Operator<?>> aliasToWork = work.getAliasToWork();
                final Map<Path, ArrayList<String>> pathToAlias = work.getPathToAliases();
                final PartitionDesc partDesc = work.getPathToPartitionInfo().get(p);
                Runnable r = new Runnable() {

                    @Override
                    public void run() {
                        try {
                            Class<? extends InputFormat> inputFormatCls = partDesc.getInputFileFormatClass();
                            InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(inputFormatCls, myJobConf);
                            if (inputFormatObj instanceof ContentSummaryInputFormat) {
                                ContentSummaryInputFormat cs = (ContentSummaryInputFormat) inputFormatObj;
                                resultMap.put(pathStr, cs.getContentSummary(p, myJobConf));
                                return;
                            }
                            String metaTableStorage = null;
                            if (partDesc.getTableDesc() != null && partDesc.getTableDesc().getProperties() != null) {
                                metaTableStorage = partDesc.getTableDesc().getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, null);
                            }
                            if (partDesc.getProperties() != null) {
                                metaTableStorage = partDesc.getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, metaTableStorage);
                            }
                            HiveStorageHandler handler = HiveUtils.getStorageHandler(myConf, metaTableStorage);
                            if (handler instanceof InputEstimator) {
                                long total = 0;
                                TableDesc tableDesc = partDesc.getTableDesc();
                                InputEstimator estimator = (InputEstimator) handler;
                                for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAlias, p)) {
                                    JobConf jobConf = new JobConf(myJobConf);
                                    TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
                                    Utilities.setColumnNameList(jobConf, scanOp, true);
                                    Utilities.setColumnTypeList(jobConf, scanOp, true);
                                    PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
                                    Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
                                    total += estimator.estimate(jobConf, scanOp, -1).getTotalLength();
                                }
                                resultMap.put(pathStr, new ContentSummary(total, -1, -1));
                            } else {
                                // TODO: the summary should be nullified for non-native tables
                                // so they are not selected as a mapjoin target
                                FileSystem fs = p.getFileSystem(myConf);
                                resultMap.put(pathStr, fs.getContentSummary(p));
                            }
                        } catch (Exception e) {
                            // We safely ignore this exception for summary data. We don't
                            // update the cache, to avoid polluting it for other usages. In
                            // the worst case the IOException is simply retried on the next
                            // getInputSummary() call, which is fine since IOException is
                            // not a common case.
                            LOG.info("Cannot get size of " + pathStr + ". Safely ignored.");
                        }
                    }
                };
                if (executor == null) {
                    r.run();
                } else {
                    Future<?> result = executor.submit(r);
                    results.add(result);
                }
            }
            if (executor != null) {
                for (Future<?> result : results) {
                    boolean executorDone = false;
                    do {
                        try {
                            result.get();
                            executorDone = true;
                        } catch (InterruptedException e) {
                            LOG.info("Interrupted when waiting threads: ", e);
                            Thread.currentThread().interrupt();
                            break;
                        } catch (ExecutionException e) {
                            throw new IOException(e);
                        }
                    } while (!executorDone);
                }
                executor.shutdown();
            }
            HiveInterruptUtils.checkInterrupted();
            for (Map.Entry<String, ContentSummary> entry : resultMap.entrySet()) {
                ContentSummary cs = entry.getValue();
                summary[0] += cs.getLength();
                summary[1] += cs.getFileCount();
                summary[2] += cs.getDirectoryCount();
                ctx.addCS(entry.getKey(), cs);
                LOG.info("Cache Content Summary for " + entry.getKey() + " length: " + cs.getLength() + " file count: " + cs.getFileCount() + " directory count: " + cs.getDirectoryCount());
            }
            perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
            return new ContentSummary(summary[0], summary[1], summary[2]);
        } finally {
            HiveInterruptUtils.remove(interrup);
        }
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) ArrayList(java.util.ArrayList) ContentSummaryInputFormat(org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) ExecutionException(java.util.concurrent.ExecutionException) JobConf(org.apache.hadoop.mapred.JobConf) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) InputEstimator(org.apache.hadoop.hive.ql.metadata.InputEstimator) HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) HiveInterruptCallback(org.apache.hadoop.hive.common.HiveInterruptCallback) IOException(java.io.IOException) SQLFeatureNotSupportedException(java.sql.SQLFeatureNotSupportedException) SQLTransientException(java.sql.SQLTransientException) SQLException(java.sql.SQLException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) EOFException(java.io.EOFException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) SequenceFileInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat) ReworkMapredInputFormat(org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat) ContentSummaryInputFormat(org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) OneNullRowInputFormat(org.apache.hadoop.hive.ql.io.OneNullRowInputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) ContentSummary(org.apache.hadoop.fs.ContentSummary) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap)
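
The core of the name-node branch above is one getContentSummary() call per path, fanned out over a daemon thread pool, then summed. A rough sketch of just that part, with the Hive-specific caching, interrupt handling, and storage-handler estimation left out; the class and method names are illustrative.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class InputSummarySketch {

    /** Sums length, file count and directory count over the given paths in parallel. */
    static ContentSummary summarize(final Configuration conf, List<Path> paths, int threads)
            throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(threads,
                new ThreadFactoryBuilder().setDaemon(true)
                        .setNameFormat("Get-Input-Summary-%d").build());
        try {
            List<Future<ContentSummary>> futures = new ArrayList<Future<ContentSummary>>();
            for (final Path p : paths) {
                futures.add(pool.submit(new Callable<ContentSummary>() {

                    @Override
                    public ContentSummary call() throws IOException {
                        // One name node call per path, issued from the pool.
                        FileSystem fs = p.getFileSystem(conf);
                        return fs.getContentSummary(p);
                    }
                }));
            }
            long length = 0, files = 0, dirs = 0;
            for (Future<ContentSummary> f : futures) {
                ContentSummary cs = f.get();
                length += cs.getLength();
                files += cs.getFileCount();
                dirs += cs.getDirectoryCount();
            }
            return new ContentSummary(length, files, dirs);
        } finally {
            pool.shutdownNow();
        }
    }
}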

Example 8 with InputFormat

Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.

The class HiveFileFormatUtils, method checkTextInputFormat.

@SuppressWarnings("unchecked")
private static boolean checkTextInputFormat(FileSystem fs, HiveConf conf, List<FileStatus> files) throws HiveException {
    List<FileStatus> files2 = new LinkedList<>(files);
    Iterator<FileStatus> iter = files2.iterator();
    while (iter.hasNext()) {
        FileStatus file = iter.next();
        if (file == null)
            continue;
        if (isPipe(fs, file)) {
            LOG.info("Skipping format check for " + file.getPath() + " as it is a pipe");
            iter.remove();
        }
    }
    if (files2.isEmpty())
        return true;
    Set<Class<? extends InputFormat>> inputFormatter = FileChecker.getInstance().registeredClasses();
    for (Class<? extends InputFormat> reg : inputFormatter) {
        boolean result = checkInputFormat(fs, conf, reg, files2);
        if (result) {
            return false;
        }
    }
    return true;
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) SequenceFileInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LinkedList(java.util.LinkedList)
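
The per-format checkInputFormat() helper is not shown above. As one illustration of the kind of probe such a check could perform, a file can be sniffed for the SequenceFile magic header ("SEQ"); this is only a sketch of the idea, not necessarily how Hive's FileChecker implements it, and the helper name is hypothetical.

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;

public class SequenceFileProbe {

    /** Returns true if the file starts with the SequenceFile magic bytes 'S','E','Q'. */
    static boolean looksLikeSequenceFile(FileSystem fs, FileStatus file) throws IOException {
        if (file.getLen() < 3) {
            return false;
        }
        byte[] magic = new byte[3];
        try (FSDataInputStream in = fs.open(file.getPath())) {
            in.readFully(magic);
        }
        return magic[0] == 'S' && magic[1] == 'E' && magic[2] == 'Q';
    }
}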

Example 9 with InputFormat

Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.

The class HiveInputFormat, method getRecordReader.

public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    HiveInputSplit hsplit = (HiveInputSplit) split;
    InputSplit inputSplit = hsplit.getInputSplit();
    String inputFormatClassName = null;
    Class inputFormatClass = null;
    try {
        inputFormatClassName = hsplit.inputFormatClassName();
        inputFormatClass = job.getClassByName(inputFormatClassName);
    } catch (Exception e) {
        throw new IOException("cannot find class " + inputFormatClassName, e);
    }
    if (this.mrwork == null || pathToPartitionInfo == null) {
        init(job);
    }
    boolean nonNative = false;
    PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, hsplit.getPath(), null);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Found spec for " + hsplit.getPath() + " " + part + " from " + pathToPartitionInfo);
    }
    if ((part != null) && (part.getTableDesc() != null)) {
        Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), job);
        nonNative = part.getTableDesc().isNonNative();
    }
    Path splitPath = hsplit.getPath();
    pushProjectionsAndFilters(job, inputFormatClass, splitPath, nonNative);
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    try {
        inputFormat = HiveInputFormat.wrapForLlap(inputFormat, job, part);
    } catch (HiveException e) {
        throw new IOException(e);
    }
    RecordReader innerReader = null;
    try {
        innerReader = inputFormat.getRecordReader(inputSplit, job, reporter);
    } catch (Exception e) {
        innerReader = HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(e, job);
    }
    HiveRecordReader<K, V> rr = new HiveRecordReader(innerReader, job);
    rr.initIOContext(hsplit, job, inputFormatClass, innerReader);
    return rr;
}
Also used : Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) RecordReader(org.apache.hadoop.mapred.RecordReader) IOException(java.io.IOException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) IOException(java.io.IOException) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) VectorPartitionDesc(org.apache.hadoop.hive.ql.plan.VectorPartitionDesc) InputSplit(org.apache.hadoop.mapred.InputSplit)
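
A reduced sketch of the lookup-and-open step in isolation: resolve the InputFormat class by name from the JobConf, instantiate it, and open a record reader on the split. The class ReaderByClassName is illustrative; the LLAP wrapping, partition handling and IO-context setup from the method above are omitted.

import java.io.IOException;

import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;

public class ReaderByClassName {

    /** Resolves a mapred InputFormat by class name and opens a reader on the split. */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    static RecordReader openReader(String inputFormatClassName, InputSplit split, JobConf job)
            throws IOException {
        Class<? extends InputFormat> clazz;
        try {
            clazz = (Class<? extends InputFormat>) job.getClassByName(inputFormatClassName);
        } catch (ClassNotFoundException e) {
            throw new IOException("cannot find class " + inputFormatClassName, e);
        }
        // ReflectionUtils.newInstance also configures the instance when it is
        // Configurable or JobConfigurable.
        InputFormat format = ReflectionUtils.newInstance(clazz, job);
        return format.getRecordReader(split, job, Reporter.NULL);
    }
}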

Example 10 with InputFormat

Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.

The class BucketizedHiveInputFormat, method getSplits.

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    init(job);
    Path[] dirs = getInputPaths(job);
    JobConf newjob = new JobConf(job);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
    int numOrigSplits = 0;
    // For each input directory, run getSplits on each individual file, and then
    // wrap the resulting splits in a BucketizedHiveInputSplit.
    for (Path dir : dirs) {
        PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
        // create a new InputFormat instance if this is the first time this class is seen
        Class inputFormatClass = part.getInputFileFormatClass();
        InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
        newjob.setInputFormat(inputFormat.getClass());
        FileStatus[] listStatus = listStatus(newjob, dir);
        for (FileStatus status : listStatus) {
            LOG.info("block size: " + status.getBlockSize());
            LOG.info("file length: " + status.getLen());
            FileInputFormat.setInputPaths(newjob, status.getPath());
            InputSplit[] iss = inputFormat.getSplits(newjob, 0);
            if (iss != null && iss.length > 0) {
                numOrigSplits += iss.length;
                result.add(new BucketizedHiveInputSplit(iss, inputFormatClass.getName()));
            }
        }
    }
    LOG.info(result.size() + " bucketized splits generated from " + numOrigSplits + " original splits.");
    return result.toArray(new BucketizedHiveInputSplit[result.size()]);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)
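
The essential move here is one getSplits() call per file, with the input path reset each time. A standalone sketch of that pattern using TextInputFormat in place of the partition's real input format; the class and method names are illustrative, and the BucketizedHiveInputSplit wrapping is only noted in a comment.

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class PerFileSplits {

    /** Generates splits one file at a time, the way the loop above groups them. */
    static List<InputSplit[]> splitEachFile(JobConf job, Path dir) throws Exception {
        FileSystem fs = dir.getFileSystem(job);
        List<InputSplit[]> perFile = new ArrayList<InputSplit[]>();
        for (FileStatus status : fs.listStatus(dir)) {
            if (!status.isFile()) {
                continue;
            }
            // Point the job at this single file and split it on its own.
            JobConf newjob = new JobConf(job);
            FileInputFormat.setInputPaths(newjob, status.getPath());
            TextInputFormat format = new TextInputFormat();
            format.configure(newjob);
            InputSplit[] splits = format.getSplits(newjob, 0);
            if (splits != null && splits.length > 0) {
                // BucketizedHiveInputFormat would wrap each group in a
                // BucketizedHiveInputSplit at this point.
                perFile.add(splits);
            }
        }
        return perFile;
    }
}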

Aggregations

InputFormat (org.apache.hadoop.mapred.InputFormat): 34 uses
Path (org.apache.hadoop.fs.Path): 23 uses
JobConf (org.apache.hadoop.mapred.JobConf): 20 uses
InputSplit (org.apache.hadoop.mapred.InputSplit): 19 uses
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 15 uses
IOException (java.io.IOException): 11 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 8 uses
ArrayList (java.util.ArrayList): 7 uses
HashMap (java.util.HashMap): 7 uses
Configuration (org.apache.hadoop.conf.Configuration): 6 uses
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 6 uses
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 6 uses
HiveInputFormat (org.apache.hadoop.hive.ql.io.HiveInputFormat): 5 uses
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 5 uses
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 5 uses
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 5 uses
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 5 uses
NullWritable (org.apache.hadoop.io.NullWritable): 5 uses
Mapper (org.apache.hadoop.mapred.Mapper): 5 uses
Test (org.junit.Test): 5 uses