Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.
The class CombineHiveInputFormat, method getCombineSplits.
/**
* Create Hive splits based on CombineFileSplit.
*/
private InputSplit[] getCombineSplits(JobConf job, int numSplits,
    Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
  init(job);
  Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
  Map<String, Operator<? extends OperatorDesc>> aliasToWork = mrwork.getAliasToWork();
  CombineFileInputFormatShim combine = ShimLoader.getHadoopShims().getCombineFileInputFormat();
  InputSplit[] splits = null;
  if (combine == null) {
    splits = super.getSplits(job, numSplits);
    return splits;
  }
  if (combine.getInputPathsShim(job).length == 0) {
    throw new IOException("No input paths specified in job");
  }
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();
  // Combine splits only from the same table and the same partition. Do not combine splits
  // from multiple tables or multiple partitions.
  Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
  List<Path> inpDirs = new ArrayList<Path>();
  List<Path> inpFiles = new ArrayList<Path>();
  Map<CombinePathInputFormat, CombineFilter> poolMap =
      new HashMap<CombinePathInputFormat, CombineFilter>();
  Set<Path> poolSet = new HashSet<Path>();
  for (Path path : paths) {
    PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
        pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
    TableDesc tableDesc = part.getTableDesc();
    if ((tableDesc != null) && tableDesc.isNonNative()) {
      return super.getSplits(job, numSplits);
    }
    // Use HiveInputFormat if any of the paths is not splittable.
    Class inputFormatClass = part.getInputFileFormatClass();
    String inputFormatClassName = inputFormatClass.getName();
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    String deserializerClassName = null;
    try {
      deserializerClassName = part.getDeserializer(job).getClass().getName();
    } catch (Exception e) {
      // ignore
    }
    FileSystem inpFs = path.getFileSystem(job);
    // Don't combine if the input format is a SymlinkTextInputFormat.
    if (inputFormat instanceof SymlinkTextInputFormat) {
      splits = super.getSplits(job, numSplits);
      return splits;
    }
    Path filterPath = path;
    // Does a pool already exist for this path?
    CombineFilter f = null;
    List<Operator<? extends OperatorDesc>> opList = null;
    if (!mrwork.isMapperCannotSpanPartns()) {
      // If the mapper can span partitions, make sure a split does not contain multiple
      // opList + inputFormatClassName + deserializerClassName combinations.
      // This is done using the map of CombinePathInputFormat to PathFilter.
      opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath);
      CombinePathInputFormat combinePathInputFormat =
          new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
      f = poolMap.get(combinePathInputFormat);
      if (f == null) {
        f = new CombineFilter(filterPath);
        LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath);
        combine.createPool(job, f);
        poolMap.put(combinePathInputFormat, f);
      } else {
        LOG.info("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath);
        f.addPath(filterPath);
      }
    } else {
      // The mapper cannot span partitions: group files by their parent directory so a split
      // combines files from the same partition but won't cross multiple partitions, as the user has asked.
      if (!path.getFileSystem(job).getFileStatus(path).isDir()) {
        // path is not a directory
        filterPath = path.getParent();
        inpFiles.add(path);
        poolSet.add(filterPath);
      } else {
        inpDirs.add(path);
      }
    }
  }
  // Processing directories
  List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
  if (!mrwork.isMapperCannotSpanPartns()) {
    // The mapper can span partitions: combine into as few as one split, subject to the
    // PathFilters set using combine.createPool.
    iss = Arrays.asList(combine.getSplits(job, 1));
  } else {
    for (Path path : inpDirs) {
      processPaths(job, combine, iss, path);
    }
    if (inpFiles.size() > 0) {
      // Processing files
      for (Path filterPath : poolSet) {
        combine.createPool(job, new CombineFilter(filterPath));
      }
      processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
    }
  }
  if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
    iss = sampleSplits(iss);
  }
  for (CombineFileSplit is : iss) {
    CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
    result.add(csplit);
  }
  LOG.info("number of splits " + result.size());
  return result.toArray(new CombineHiveInputSplit[result.size()]);
}
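
For context, here is a minimal sketch of the generic org.apache.hadoop.mapred driver loop that input formats such as the one above plug into: getSplits produces the splits, and getRecordReader iterates each one. The input path argument and the choice of TextInputFormat are illustrative assumptions, not part of the Hive code above.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class MapredInputFormatDriver {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();
    // Illustrative input path passed on the command line; replace with a real directory.
    FileInputFormat.addInputPath(job, new Path(args[0]));

    TextInputFormat textFormat = new TextInputFormat();
    textFormat.configure(job);
    // The same contract CombineHiveInputFormat implements: splits first, then a reader per split.
    InputFormat<LongWritable, Text> inputFormat = textFormat;

    InputSplit[] splits = inputFormat.getSplits(job, 1);
    for (InputSplit split : splits) {
      RecordReader<LongWritable, Text> reader =
          inputFormat.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey();
      Text value = reader.createValue();
      try {
        while (reader.next(key, value)) {
          System.out.println(value);
        }
      } finally {
        reader.close();
      }
    }
  }
}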
Use of org.apache.hadoop.mapred.InputFormat in project SQLWindowing by hbutani.
The class IOUtils, method createFileWindowingInput.
@SuppressWarnings("unchecked")
public static WindowingInput createFileWindowingInput(String path, String inputFormatClassName,
    String serDeClassName, Properties serDeProperties, Configuration conf) throws WindowingException {
  try {
    HiveConf hConf = new HiveConf(conf, IOUtils.class);
    JobConf job = new JobConf(hConf);
    Path p = new Path(path);
    p = makeQualified(p, conf);
    Class<? extends InputFormat<? extends Writable, ? extends Writable>> inputFormatClass =
        (Class<? extends InputFormat<? extends Writable, ? extends Writable>>) Class.forName(inputFormatClassName);
    hConf.setClass("mapred.input.format.class", inputFormatClass, InputFormat.class);
    hConf.set(INPUT_INPUTFORMAT_CLASS, inputFormatClass.getName());
    InputFormat<? extends Writable, ? extends Writable> iFmt = inputFormatClass.newInstance();
    if (iFmt instanceof TextInputFormat) {
      ((TextInputFormat) iFmt).configure(job);
    }
    FileInputFormat.addInputPath(job, p);
    InputSplit[] iSplits = iFmt.getSplits(job, 1);
    org.apache.hadoop.mapred.RecordReader<Writable, Writable> rdr =
        (org.apache.hadoop.mapred.RecordReader<Writable, Writable>) iFmt.getRecordReader(iSplits[0], job, Reporter.NULL);
    hConf.set(INPUT_PATH, path);
    hConf.set(INPUT_KEY_CLASS, rdr.createKey().getClass().getName());
    hConf.set(INPUT_VALUE_CLASS, rdr.createValue().getClass().getName());
    hConf.set(INPUT_SERDE_CLASS, serDeClassName);
    TableWindowingInput tIn = new TableWindowingInput();
    tIn.initialize(null, hConf, serDeProperties);
    return tIn;
  } catch (Exception e) {
    throw new WindowingException(e);
  }
}
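
A hedged usage sketch of the method above. The input path, column schema, and SerDe choice are illustrative assumptions, and the SQLWindowing-specific imports (IOUtils, WindowingInput, WindowingException) are omitted because their package layout is not shown in the snippet.

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;

public class WindowingInputExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Illustrative SerDe properties describing the file's columns.
    Properties serDeProps = new Properties();
    serDeProps.setProperty("columns", "id,name");
    serDeProps.setProperty("columns.types", "int,string");

    WindowingInput in = IOUtils.createFileWindowingInput(
        "/tmp/windowing/input",                               // illustrative path
        "org.apache.hadoop.mapred.TextInputFormat",
        "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
        serDeProps,
        conf);
    // The returned WindowingInput is configured with the first split's key/value classes
    // and the given SerDe, as set up in createFileWindowingInput above.
  }
}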
Use of org.apache.hadoop.mapred.InputFormat in project SQLWindowing by hbutani.
The class HiveUtils, method addTableasJobInput.
@SuppressWarnings("unchecked")
public static List<FieldSchema> addTableasJobInput(String db, String table, JobConf job,
    FileSystem fs) throws WindowingException {
  LOG.info("HiveUtils::addTableasJobInput invoked");
  try {
    HiveMetaStoreClient client = getClient(job);
    // 1. get Table details from Hive metastore
    db = validateDB(client, db);
    Table t = getTable(client, db, table);
    StorageDescriptor sd = t.getSd();
    // 2. add table's location to job input
    FileInputFormat.addInputPath(job, new Path(sd.getLocation()));
    // 3. set job inputFormatClass, extracted from StorageDescriptor
    Class<? extends InputFormat<? extends Writable, ? extends Writable>> inputFormatClass =
        (Class<? extends InputFormat<? extends Writable, ? extends Writable>>) Class.forName(sd.getInputFormat());
    job.setInputFormat(inputFormatClass);
    return client.getFields(db, table);
  } catch (WindowingException w) {
    throw w;
  } catch (Exception e) {
    throw new WindowingException(e);
  }
}
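
A hedged usage sketch of the method above. The database and table names are illustrative, and the SQLWindowing-specific imports (HiveUtils, WindowingException) are omitted.

import java.util.List;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.mapred.JobConf;

public class AddTableAsInputExample {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(new HiveConf());
    FileSystem fs = FileSystem.get(job);

    // Registers the table's location as job input and sets its InputFormat class on the JobConf.
    List<FieldSchema> columns = HiveUtils.addTableasJobInput("default", "src", job, fs);
    for (FieldSchema col : columns) {
      System.out.println(col.getName() + " : " + col.getType());
    }
  }
}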
Use of org.apache.hadoop.mapred.InputFormat in project asterixdb by apache.
The class HDFSReadOperatorDescriptor, method createPushRuntime.
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
    IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
    throws HyracksDataException {
  final InputSplit[] inputSplits = splitsFactory.getSplits();
  return new AbstractUnaryOutputSourceOperatorNodePushable() {

    private String nodeName = ctx.getJobletContext().getServiceContext().getNodeId();

    @SuppressWarnings("unchecked")
    @Override
    public void initialize() throws HyracksDataException {
      ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
      try {
        writer.open();
        Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
        JobConf conf = confFactory.getConf();
        conf.setClassLoader(ctx.getJobletContext().getClassLoader());
        IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
        try {
          parser.open(writer);
          InputFormat inputFormat = conf.getInputFormat();
          for (int i = 0; i < inputSplits.length; i++) {
            /**
             * read all the partitions scheduled to the current node
             */
            if (scheduledLocations[i].equals(nodeName)) {
              /**
               * pick an unread split to read
               * synchronize among simultaneous partitions in the same machine
               */
              synchronized (executed) {
                if (executed[i] == false) {
                  executed[i] = true;
                } else {
                  continue;
                }
              }
              /**
               * read the split
               */
              RecordReader reader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);
              Object key = reader.createKey();
              Object value = reader.createValue();
              while (reader.next(key, value) == true) {
                parser.parse(key, value, writer, inputSplits[i].toString());
              }
            }
          }
        } finally {
          parser.close(writer);
        }
      } catch (Throwable th) {
        writer.fail();
        throw new HyracksDataException(th);
      } finally {
        writer.close();
        Thread.currentThread().setContextClassLoader(ctxCL);
      }
    }
  };
}
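
The operator above lets several local partitions share one split array and uses a synchronized boolean[] so each split is read exactly once. Below is a standalone sketch of that claim-once pattern; the class and method names are illustrative, not part of the asterixdb code.

public class SplitClaimSketch {
  private final boolean[] executed;   // one flag per split, all false initially

  public SplitClaimSketch(int numSplits) {
    this.executed = new boolean[numSplits];
  }

  /** Returns true only for the first caller that asks for split i; later callers skip it. */
  public boolean claim(int i) {
    synchronized (executed) {
      if (executed[i]) {
        return false;   // another partition already took this split
      }
      executed[i] = true;
      return true;
    }
  }
}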
Use of org.apache.hadoop.mapred.InputFormat in project drill by apache.
The class HiveUtilities, method getInputFormatClass.
/**
 * Utility method which gets the table or partition {@link InputFormat} class. First it
 * tries to get the class name from the given StorageDescriptor object. If that does not contain it,
 * it tries to get it from the StorageHandler class set in the table properties. If still not found,
 * it throws an exception.
 * @param job {@link JobConf} instance, needed in case the table is a StorageHandler based table.
 * @param sd {@link StorageDescriptor} instance of the partition being read, or of the table for non-partitioned tables.
 * @param table Table object
 * @throws Exception
 */
public static Class<? extends InputFormat<?, ?>> getInputFormatClass(final JobConf job,
    final StorageDescriptor sd, final Table table) throws Exception {
  final String inputFormatName = sd.getInputFormat();
  if (Strings.isNullOrEmpty(inputFormatName)) {
    final String storageHandlerClass = table.getParameters().get(META_TABLE_STORAGE);
    if (Strings.isNullOrEmpty(storageHandlerClass)) {
      throw new ExecutionSetupException("Unable to get Hive table InputFormat class. There is neither "
          + "InputFormat class explicitly specified nor StorageHandler class");
    }
    final HiveStorageHandler storageHandler = HiveUtils.getStorageHandler(job, storageHandlerClass);
    return (Class<? extends InputFormat<?, ?>>) storageHandler.getInputFormatClass();
  } else {
    return (Class<? extends InputFormat<?, ?>>) Class.forName(inputFormatName);
  }
}
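
A hedged sketch of how a caller might use the resolved class. Instantiating via ReflectionUtils is an assumption shown only to illustrate that the returned class is a standard mapred InputFormat; the Drill-specific import for HiveUtilities is omitted, and the StorageDescriptor and Table objects are assumed to come from the Hive metastore as in the method above.

import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;

public class InputFormatResolutionExample {
  public static InputFormat<?, ?> resolve(JobConf job, StorageDescriptor sd, Table table) throws Exception {
    Class<? extends InputFormat<?, ?>> clazz = HiveUtilities.getInputFormatClass(job, sd, table);
    // ReflectionUtils honors JobConfigurable/Configurable, so input formats that need the
    // JobConf (e.g. TextInputFormat) are configured on creation.
    return ReflectionUtils.newInstance(clazz, job);
  }
}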