Example 1 with TextInputFormat

Use of org.apache.hadoop.mapred.TextInputFormat in project flink by apache.

The class WordCountMapredITCase, method internalRun:

private void internalRun(boolean isTestDeprecatedAPI) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<LongWritable, Text>> input;
    if (isTestDeprecatedAPI) {
        input = env.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath);
    } else {
        input = env.createInput(readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
    }
    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {

        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });
    // split up the lines into (word, 1) pairs, then group by word and sum the counts
    DataSet<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).groupBy(0).sum(1);
    DataSet<Tuple2<Text, LongWritable>> words = counts.map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {

        @Override
        public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
            return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
        }
    });
    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(resultPath));
    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}
Also used: Path (org.apache.hadoop.fs.Path), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Text (org.apache.hadoop.io.Text), HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormat), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), LongWritable (org.apache.hadoop.io.LongWritable), Tokenizer (org.apache.flink.test.testfunctions.Tokenizer), JobConf (org.apache.hadoop.mapred.JobConf)
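
The Tokenizer referenced above lives in Flink's test utilities (org.apache.flink.test.testfunctions.Tokenizer) and is not shown in this snippet. A minimal stand-in, assuming the usual WordCount splitting behavior, could look like this:

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

// Hypothetical equivalent of the Tokenizer used above: splits each line
// into lowercase words and emits one (word, 1) tuple per word.
public final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {

    @Override
    public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
        for (String token : value.toLowerCase().split("\\W+")) {
            if (!token.isEmpty()) {
                out.collect(new Tuple2<>(token, 1));
            }
        }
    }
}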

Example 2 with TextInputFormat

Use of org.apache.hadoop.mapred.TextInputFormat in project SQLWindowing by hbutani.

The class IOUtils, method createFileWindowingInput:

@SuppressWarnings("unchecked")
public static WindowingInput createFileWindowingInput(String path, String inputFormatClassName, String serDeClassName, Properties serDeProperties, Configuration conf) throws WindowingException {
    try {
        HiveConf hConf = new HiveConf(conf, IOUtils.class);
        JobConf job = new JobConf(hConf);
        Path p = new Path(path);
        p = makeQualified(p, conf);
        Class<? extends InputFormat<? extends Writable, ? extends Writable>> inputFormatClass = (Class<? extends InputFormat<? extends Writable, ? extends Writable>>) Class.forName(inputFormatClassName);
        hConf.setClass("mapred.input.format.class", inputFormatClass, InputFormat.class);
        hConf.set(INPUT_INPUTFORMAT_CLASS, inputFormatClass.getName());
        InputFormat<? extends Writable, ? extends Writable> iFmt = inputFormatClass.newInstance();
        if (iFmt instanceof TextInputFormat) {
            ((TextInputFormat) iFmt).configure(job);
        }
        FileInputFormat.addInputPath(job, p);
        InputSplit[] iSplits = iFmt.getSplits(job, 1);
        org.apache.hadoop.mapred.RecordReader<Writable, Writable> rdr = (org.apache.hadoop.mapred.RecordReader<Writable, Writable>) iFmt.getRecordReader(iSplits[0], job, Reporter.NULL);
        hConf.set(INPUT_PATH, path);
        hConf.set(INPUT_KEY_CLASS, rdr.createKey().getClass().getName());
        hConf.set(INPUT_VALUE_CLASS, rdr.createValue().getClass().getName());
        hConf.set(INPUT_SERDE_CLASS, serDeClassName);
        TableWindowingInput tIn = new TableWindowingInput();
        tIn.initialize(null, hConf, serDeProperties);
        return tIn;
    } catch (Exception e) {
        throw new WindowingException(e);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Writable (org.apache.hadoop.io.Writable), IOException (java.io.IOException), WindowingException (com.sap.hadoop.windowing.WindowingException), SerDeException (org.apache.hadoop.hive.serde2.SerDeException), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), InputFormat (org.apache.hadoop.mapred.InputFormat), FileInputFormat (org.apache.hadoop.mapred.FileInputFormat), HiveConf (org.apache.hadoop.hive.conf.HiveConf), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit)
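
A hypothetical invocation of createFileWindowingInput, assuming a plain-text table read with TextInputFormat and Hive's LazySimpleSerDe; the path and column schema below are illustrative only:

Properties serDeProps = new Properties();
serDeProps.setProperty("columns", "id,name");           // illustrative column names
serDeProps.setProperty("columns.types", "int,string");  // illustrative column types

WindowingInput in = IOUtils.createFileWindowingInput(
        "/user/hive/warehouse/demo",                    // hypothetical table path
        "org.apache.hadoop.mapred.TextInputFormat",
        "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
        serDeProps,
        new Configuration());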

Example 3 with TextInputFormat

Use of org.apache.hadoop.mapred.TextInputFormat in project SQLWindowing by hbutani.

The class TableWindowingInput, method initialize:

@SuppressWarnings("unchecked")
@Override
public void initialize(InputStream ins, Configuration conf, Properties tbl) throws IOException {
    try {
        setupSerDe(conf, tbl);
        tableDirPath = new Path(conf.get(INPUT_PATH));
        keyClass = (Class<? extends Writable>) Class.forName(conf.get(INPUT_KEY_CLASS));
        valueClass = (Class<? extends Writable>) Class.forName(conf.get(INPUT_VALUE_CLASS));
        inputFormatClass = (Class<? extends InputFormat<? extends Writable, ? extends Writable>>) Class.forName(conf.get(INPUT_INPUTFORMAT_CLASS));
        FileSystem fs = FileSystem.get(tableDirPath.toUri(), conf);
        FileStatus tableDir = fs.getFileStatus(tableDirPath);
        assert tableDir.isDir();
        tableFiles = fs.listStatus(tableDirPath);
        jconf = new JobConf();
        iFmt = inputFormatClass.newInstance();
        if (iFmt instanceof TextInputFormat)
            ((TextInputFormat) iFmt).configure(jconf);
        jconf.set("mapred.input.dir", tableDirPath.toString());
        currSplitNum = 0;
        iSplits = iFmt.getSplits(jconf, 1);
        nextSplit();
    } catch (Exception e) {
        throw new IOException(e);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), FileSystem (org.apache.hadoop.fs.FileSystem), IOException (java.io.IOException), JobConf (org.apache.hadoop.mapred.JobConf)
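
The instanceof TextInputFormat check above (and in Example 2) exists because the old mapred InputFormat interface has no configure hook; TextInputFormat picks one up by implementing JobConfigurable. A slightly more general form of the same guard, as a sketch:

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;

// Configures any mapred input format that opts into JobConfigurable
// (TextInputFormat among them) instead of testing for one concrete class.
static void configureIfNeeded(Object inputFormat, JobConf job) {
    if (inputFormat instanceof JobConfigurable) {
        ((JobConfigurable) inputFormat).configure(job);
    }
}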

Example 4 with TextInputFormat

Use of org.apache.hadoop.mapred.TextInputFormat in project presto by prestodb.

The class BackgroundHiveSplitLoader, method loadPartition:

private void loadPartition(HivePartitionMetadata partition) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = partition.getHivePartition().getEffectivePredicate();
    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration configuration = hdfsEnvironment.getConfiguration(path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
    FileSystem fs = hdfsEnvironment.getFileSystem(session.getUser(), path);
    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (bucketHandle.isPresent()) {
            throw new PrestoException(StandardErrorCode.NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }
        // TODO: This should use an iterator like the HiveFileIterator
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // get the configuration for the target path -- it may be a different hdfs instance
            Configuration targetConfiguration = hdfsEnvironment.getConfiguration(targetPath);
            JobConf targetJob = new JobConf(targetConfiguration);
            targetJob.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(targetJob);
            FileInputFormat.setInputPaths(targetJob, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);
            if (addSplitsToSource(targetSplits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions())) {
                return;
            }
        }
        return;
    }
    // Some input formats insist on computing their own splits; in that case,
    // call getSplits on the input format to obtain file splits.
    if (shouldUseFileSplitsFromInputFormat(inputFormat)) {
        JobConf jobConf = new JobConf(configuration);
        FileInputFormat.setInputPaths(jobConf, path);
        InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
        addSplitsToSource(splits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions());
        return;
    }
    // If only one bucket could match: load that one file
    HiveFileIterator iterator = new HiveFileIterator(path, fs, directoryLister, namenodeStats, partitionName, inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions());
    if (!buckets.isEmpty()) {
        int bucketCount = buckets.get(0).getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (HiveBucket bucket : buckets) {
            int bucketNumber = bucket.getBucketNumber();
            LocatedFileStatus file = list.get(bucketNumber);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketNumber), effectivePredicate, partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }
    // If table is bucketed: list the directory, sort, tag with bucket id
    if (bucketHandle.isPresent()) {
        // HiveFileIterator skips hidden files automatically.
        int bucketCount = bucketHandle.get().getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (int bucketIndex = 0; bucketIndex < bucketCount; bucketIndex++) {
            LocatedFileStatus file = list.get(bucketIndex);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketIndex), iterator.getEffectivePredicate(), partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }
    fileIterators.addLast(iterator);
}
Also used: HiveBucket (com.facebook.presto.hive.HiveBucketing.HiveBucket), Configuration (org.apache.hadoop.conf.Configuration), ArrayList (java.util.ArrayList), PrestoException (com.facebook.presto.spi.PrestoException), Properties (java.util.Properties), SymlinkTextInputFormat (org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat), FileSystem (org.apache.hadoop.fs.FileSystem), PeekingIterator (com.google.common.collect.PeekingIterator), Iterator (java.util.Iterator), AbstractIterator (com.google.common.collect.AbstractIterator), HiveFileIterator (com.facebook.presto.hive.util.HiveFileIterator), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), Path (org.apache.hadoop.fs.Path), LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat)
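
getTargetPathsFromSymlink is a private helper not shown above. A SymlinkTextInputFormat table directory holds plain-text manifest files whose lines are the real data paths; a simplified reader for that layout might look like the sketch below (an illustration of the format, not Presto's actual implementation):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Reads every manifest file under the symlink directory and collects the
// target paths, one per non-empty line.
static List<Path> readSymlinkTargets(FileSystem fs, Path symlinkDir) throws IOException {
    List<Path> targets = new ArrayList<>();
    for (FileStatus manifest : fs.listStatus(symlinkDir)) {
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(manifest.getPath()), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.isEmpty()) {
                    targets.add(new Path(line));
                }
            }
        }
    }
    return targets;
}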

Example 5 with TextInputFormat

Use of org.apache.hadoop.mapred.TextInputFormat in project hive by apache.

The class VectorDeserializeOrcWriter, method create:

// TODO: if more writers are added, separate out an EncodingWriterFactory
public static EncodingWriter create(InputFormat<?, ?> sourceIf, Deserializer serDe, Map<Path, PartitionDesc> parts, Configuration daemonConf, Configuration jobConf, Path splitPath, StructObjectInspector sourceOi, List<Integer> sourceIncludes, boolean[] cacheIncludes, int allocSize) throws IOException {
    // Vector SerDe can be disabled both on client and server side.
    if (!HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_IO_ENCODE_VECTOR_SERDE_ENABLED) || !HiveConf.getBoolVar(jobConf, ConfVars.LLAP_IO_ENCODE_VECTOR_SERDE_ENABLED) || !(sourceIf instanceof TextInputFormat) || !(serDe instanceof LazySimpleSerDe)) {
        return new DeserializerOrcWriter(serDe, sourceOi, allocSize);
    }
    Path path = splitPath.getFileSystem(daemonConf).makeQualified(splitPath);
    PartitionDesc partDesc = HiveFileFormatUtils.getFromPathRecursively(parts, path, null);
    if (partDesc == null) {
        LlapIoImpl.LOG.info("Not using VertorDeserializeOrcWriter: no partition desc for " + path);
        return new DeserializerOrcWriter(serDe, sourceOi, allocSize);
    }
    Properties tblProps = partDesc.getTableDesc().getProperties();
    if ("true".equalsIgnoreCase(tblProps.getProperty(serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST))) {
        LlapIoImpl.LOG.info("Not using VertorDeserializeOrcWriter due to " + serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST);
        return new DeserializerOrcWriter(serDe, sourceOi, allocSize);
    }
    for (StructField sf : sourceOi.getAllStructFieldRefs()) {
        Category c = sf.getFieldObjectInspector().getCategory();
        if (c != Category.PRIMITIVE) {
            LlapIoImpl.LOG.info("Not using VertorDeserializeOrcWriter: " + c + " is not supported");
            return new DeserializerOrcWriter(serDe, sourceOi, allocSize);
        }
    }
    LlapIoImpl.LOG.info("Creating VertorDeserializeOrcWriter for " + path);
    return new VectorDeserializeOrcWriter(daemonConf, tblProps, sourceOi, sourceIncludes, cacheIncludes, allocSize);
}
Also used: DeserializerOrcWriter (org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.DeserializerOrcWriter), Path (org.apache.hadoop.fs.Path), StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField), Category (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), LazySimpleSerDe (org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), Properties (java.util.Properties)
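
The eligibility test at the top of create folds four conditions into one expression. Distilled into a standalone predicate for readability (a sketch, not an actual Hive helper):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

// The vectorized writer is only used when the vector SerDe path is enabled
// on both the daemon and the job, the source is plain text, and the row
// SerDe is LazySimpleSerDe; anything else falls back to DeserializerOrcWriter.
static boolean canUseVectorPath(InputFormat<?, ?> sourceIf, Deserializer serDe,
        Configuration daemonConf, Configuration jobConf) {
    return HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_IO_ENCODE_VECTOR_SERDE_ENABLED)
            && HiveConf.getBoolVar(jobConf, ConfVars.LLAP_IO_ENCODE_VECTOR_SERDE_ENABLED)
            && sourceIf instanceof TextInputFormat
            && serDe instanceof LazySimpleSerDe;
}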

Aggregations

TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 49 usages
InputSplit (org.apache.hadoop.mapred.InputSplit): 39 usages
IOException (java.io.IOException): 26 usages
Path (org.apache.hadoop.fs.Path): 25 usages
JobConf (org.apache.hadoop.mapred.JobConf): 24 usages
LongWritable (org.apache.hadoop.io.LongWritable): 19 usages
Text (org.apache.hadoop.io.Text): 19 usages
ArrayList (java.util.ArrayList): 16 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 14 usages
ExecutorService (java.util.concurrent.ExecutorService): 12 usages
Future (java.util.concurrent.Future): 8 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 8 usages
FastStringTokenizer (org.apache.sysml.runtime.util.FastStringTokenizer): 6 usages
Configuration (org.apache.hadoop.conf.Configuration): 4 usages
Pair (org.apache.sysml.runtime.matrix.data.Pair): 4 usages
LinkedList (java.util.LinkedList): 3 usages
Properties (java.util.Properties): 3 usages
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 3 usages
HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormat): 3 usages
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 3 usages