
Example 26 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project hive by apache.

the class SymlinkTextInputFormat method getSplits.

/**
 * Parses all target paths from job input directory which contains symlink
 * files, and splits the target data using TextInputFormat.
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path[] symlinksDirs = FileInputFormat.getInputPaths(job);
    if (symlinksDirs.length == 0) {
        throw new IOException("No input paths specified in job.");
    }
    // Get all target paths first, because the number of total target paths
    // is used to determine number of splits of each target path.
    List<Path> targetPaths = new ArrayList<Path>();
    List<Path> symlinkPaths = new ArrayList<Path>();
    try {
        getTargetPathsFromSymlinksDirs(job, symlinksDirs, targetPaths, symlinkPaths);
    } catch (Exception e) {
        throw new IOException("Error parsing symlinks from specified job input path.", e);
    }
    if (targetPaths.size() == 0) {
        return new InputSplit[0];
    }
    // The input should be in TextInputFormat.
    TextInputFormat inputFormat = new TextInputFormat();
    JobConf newjob = new JobConf(job);
    newjob.setInputFormat(TextInputFormat.class);
    inputFormat.configure(newjob);
    List<InputSplit> result = new ArrayList<InputSplit>();
    // ceil(numSplits / numPaths), so we can get at least numSplits splits.
    int numPaths = targetPaths.size();
    int numSubSplits = (numSplits + numPaths - 1) / numPaths;
    // For each path, do getSplits().
    for (int i = 0; i < numPaths; ++i) {
        Path targetPath = targetPaths.get(i);
        Path symlinkPath = symlinkPaths.get(i);
        FileInputFormat.setInputPaths(newjob, targetPath);
        InputSplit[] iss = inputFormat.getSplits(newjob, numSubSplits);
        for (InputSplit is : iss) {
            result.add(new SymlinkTextInputSplit(symlinkPath, (FileSplit) is));
        }
    }
    return result.toArray(new InputSplit[result.size()]);
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayList (java.util.ArrayList), IOException (java.io.IOException), FileSplit (org.apache.hadoop.mapred.FileSplit), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), InputSplit (org.apache.hadoop.mapred.InputSplit), JobConf (org.apache.hadoop.mapred.JobConf)
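For reference, splits produced this way are ultimately consumed through TextInputFormat's record reader. Below is a minimal stand-alone sketch, not part of the Hive code, that configures a plain TextInputFormat, requests splits, and prints every line; the TextSplitDriver class name and the /tmp/input default path are placeholders.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextSplitDriver {

    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        // placeholder input path; pass a real directory as the first argument
        FileInputFormat.setInputPaths(job, args.length > 0 ? args[0] : "/tmp/input");
        TextInputFormat inputFormat = new TextInputFormat();
        inputFormat.configure(job);
        // request at least one split; the format may return more
        InputSplit[] splits = inputFormat.getSplits(job, 1);
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
            try {
                // key is the byte offset of the line, value is the line itself
                while (reader.next(key, value)) {
                    System.out.println(key.get() + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}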

Example 27 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project apex-malhar by apache.

the class MapOperatorTest method testNodeProcessingSchema.

public void testNodeProcessingSchema(MapOperator<LongWritable, Text, Text, IntWritable> oper) throws IOException {
    CollectorTestSink sortSink = new CollectorTestSink();
    oper.output.setSink(sortSink);
    oper.setMapClass(WordCount.Map.class);
    oper.setCombineClass(WordCount.Reduce.class);
    oper.setDirName(testMeta.testDir);
    oper.setConfigFile(null);
    oper.setInputFormatClass(TextInputFormat.class);
    Configuration conf = new Configuration();
    JobConf jobConf = new JobConf(conf);
    FileInputFormat.setInputPaths(jobConf, new Path(testMeta.testDir));
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(jobConf);
    InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
    keySerializer.open(oper.getOutstream());
    keySerializer.serialize(splits[0]);
    oper.setInputSplitClass(splits[0].getClass());
    keySerializer.close();
    oper.setup(null);
    oper.beginWindow(0);
    oper.emitTuples();
    oper.emitTuples();
    oper.endWindow();
    oper.beginWindow(1);
    oper.emitTuples();
    oper.endWindow();
    Assert.assertEquals("number emitted tuples", 3, sortSink.collectedTuples.size());
    for (Object o : sortSink.collectedTuples) {
        LOG.debug(o.toString());
    }
    LOG.debug("Done testing round\n");
    oper.teardown();
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), SerializationFactory (org.apache.hadoop.io.serializer.SerializationFactory), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), CollectorTestSink (org.apache.apex.malhar.lib.testbench.CollectorTestSink), Serializer (org.apache.hadoop.io.serializer.Serializer)
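The keySerializer handling above relies on Hadoop's pluggable serialization to hand the InputSplit to the operator. The following sketch, a hypothetical SplitRoundTrip class with a placeholder path and not part of the apex-malhar test, shows the same SerializationFactory pattern round-tripping a FileSplit through an in-memory buffer.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapred.FileSplit;

public class SplitRoundTrip {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        SerializationFactory factory = new SerializationFactory(conf);
        // placeholder split covering the first 1024 bytes of a file
        FileSplit split = new FileSplit(new Path("/tmp/input/part-00000"), 0L, 1024L, new String[0]);

        // serialize the split into an in-memory buffer
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        Serializer<FileSplit> serializer = factory.getSerializer(FileSplit.class);
        serializer.open(bytes);
        serializer.serialize(split);
        serializer.close();

        // deserialize it back into a fresh FileSplit instance
        Deserializer<FileSplit> deserializer = factory.getDeserializer(FileSplit.class);
        deserializer.open(new ByteArrayInputStream(bytes.toByteArray()));
        FileSplit copy = deserializer.deserialize(null);
        deserializer.close();

        System.out.println(copy.getPath() + " @" + copy.getStart() + "+" + copy.getLength());
    }
}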

Example 28 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project hadoop by apache.

the class ValueAggregatorJob method createValueAggregatorJob.

/**
   * Create an Aggregate based map/reduce job.
   *
   * @param args the arguments used for job creation. Generic hadoop
   * arguments are accepted.
   * @param caller the caller class.
   * @return a JobConf object ready for submission.
   *
   * @throws IOException
   * @see GenericOptionsParser
   */
@SuppressWarnings("rawtypes")
public static JobConf createValueAggregatorJob(String[] args, Class<?> caller) throws IOException {
    Configuration conf = new Configuration();
    GenericOptionsParser genericParser = new GenericOptionsParser(conf, args);
    args = genericParser.getRemainingArgs();
    if (args.length < 2) {
        System.out.println("usage: inputDirs outDir " + "[numOfReducer [textinputformat|seq [specfile [jobName]]]]");
        GenericOptionsParser.printGenericCommandUsage(System.out);
        System.exit(1);
    }
    String inputDir = args[0];
    String outputDir = args[1];
    int numOfReducers = 1;
    if (args.length > 2) {
        numOfReducers = Integer.parseInt(args[2]);
    }
    // TextInputFormat is used only when "textinputformat" is explicitly requested in args[3];
    // otherwise the job falls back to SequenceFileInputFormat
    Class<? extends InputFormat> theInputFormat = TextInputFormat.class;
    if (args.length > 3 && args[3].compareToIgnoreCase("textinputformat") == 0) {
        theInputFormat = TextInputFormat.class;
    } else {
        theInputFormat = SequenceFileInputFormat.class;
    }
    Path specFile = null;
    if (args.length > 4) {
        specFile = new Path(args[4]);
    }
    String jobName = "";
    if (args.length > 5) {
        jobName = args[5];
    }
    JobConf theJob = new JobConf(conf);
    if (specFile != null) {
        theJob.addResource(specFile);
    }
    String userJarFile = theJob.get("user.jar.file");
    if (userJarFile == null) {
        theJob.setJarByClass(caller != null ? caller : ValueAggregatorJob.class);
    } else {
        theJob.setJar(userJarFile);
    }
    theJob.setJobName("ValueAggregatorJob: " + jobName);
    FileInputFormat.addInputPaths(theJob, inputDir);
    theJob.setInputFormat(theInputFormat);
    theJob.setMapperClass(ValueAggregatorMapper.class);
    FileOutputFormat.setOutputPath(theJob, new Path(outputDir));
    theJob.setOutputFormat(TextOutputFormat.class);
    theJob.setMapOutputKeyClass(Text.class);
    theJob.setMapOutputValueClass(Text.class);
    theJob.setOutputKeyClass(Text.class);
    theJob.setOutputValueClass(Text.class);
    theJob.setReducerClass(ValueAggregatorReducer.class);
    theJob.setCombinerClass(ValueAggregatorCombiner.class);
    theJob.setNumMapTasks(1);
    theJob.setNumReduceTasks(numOfReducers);
    return theJob;
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), JobConf (org.apache.hadoop.mapred.JobConf), GenericOptionsParser (org.apache.hadoop.util.GenericOptionsParser)
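A driver would typically pass the returned JobConf straight to JobClient. A minimal sketch, assuming a hypothetical AggregateDriver class and placeholder arguments:

import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorJob;

public class AggregateDriver {

    public static void main(String[] args) throws Exception {
        // e.g. args = { "/tmp/in", "/tmp/out", "2", "textinputformat" }
        JobConf job = ValueAggregatorJob.createValueAggregatorJob(args, AggregateDriver.class);
        // runJob submits the job and blocks until it finishes
        JobClient.runJob(job);
    }
}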

Example 29 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project SQLWindowing by hbutani.

the class IOUtils method createTableWindowingInput.

@SuppressWarnings("unchecked")
public static WindowingInput createTableWindowingInput(String dbName, String tableName, Configuration conf) throws WindowingException {
    try {
        HiveMetaStoreClient client = HiveUtils.getClient(conf);
        String db = HiveUtils.validateDB(client, dbName);
        Table t = HiveUtils.getTable(client, db, tableName);
        StorageDescriptor sd = t.getSd();
        HiveConf hConf = new HiveConf(conf, IOUtils.class);
        JobConf job = new JobConf(hConf);
        Class<? extends InputFormat<? extends Writable, ? extends Writable>> inputFormatClass = (Class<? extends InputFormat<? extends Writable, ? extends Writable>>) Class.forName(sd.getInputFormat());
        hConf.setClass("mapred.input.format.class", inputFormatClass, InputFormat.class);
        hConf.set(INPUT_INPUTFORMAT_CLASS, inputFormatClass.getName());
        InputFormat<? extends Writable, ? extends Writable> iFmt = inputFormatClass.newInstance();
        if (iFmt instanceof TextInputFormat) {
            ((TextInputFormat) iFmt).configure(job);
        }
        Path p = new Path(sd.getLocation());
        /*
			 * Convert the Path in the StorageDescriptor into a Path in the current FileSystem.
			 * Used in testing: Jobs run on MiniDFSCluster, whereas hive metadata refers to a real cluster.
			 */
        {
            p = makeQualified(p, conf);
        }
        FileInputFormat.addInputPath(job, p);
        InputSplit[] iSplits = iFmt.getSplits(job, 1);
        org.apache.hadoop.mapred.RecordReader<Writable, Writable> rdr = (org.apache.hadoop.mapred.RecordReader<Writable, Writable>) iFmt.getRecordReader(iSplits[0], job, Reporter.NULL);
        hConf.set(INPUT_PATH, sd.getLocation());
        hConf.set(INPUT_KEY_CLASS, rdr.createKey().getClass().getName());
        hConf.set(INPUT_VALUE_CLASS, rdr.createValue().getClass().getName());
        hConf.set(INPUT_SERDE_CLASS, sd.getSerdeInfo().getSerializationLib());
        TableWindowingInput tIn = new TableWindowingInput();
        tIn.initialize(null, hConf, MetaStoreUtils.getSchema(t));
        return tIn;
    } catch (WindowingException w) {
        throw w;
    } catch (Exception e) {
        throw new WindowingException(e);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), HiveMetaStoreClient (org.apache.hadoop.hive.metastore.HiveMetaStoreClient), Table (org.apache.hadoop.hive.metastore.api.Table), StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor), Writable (org.apache.hadoop.io.Writable), IOException (java.io.IOException), WindowingException (com.sap.hadoop.windowing.WindowingException), SerDeException (org.apache.hadoop.hive.serde2.SerDeException), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), InputFormat (org.apache.hadoop.mapred.InputFormat), FileInputFormat (org.apache.hadoop.mapred.FileInputFormat), HiveConf (org.apache.hadoop.hive.conf.HiveConf), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit)
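The explicit instanceof TextInputFormat check is needed only because the format is created with newInstance(); ReflectionUtils.newInstance configures any JobConfigurable automatically when given a JobConf. A minimal sketch of that alternative, using a hypothetical InputFormatFactory class that is not part of SQLWindowing:

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;

public class InputFormatFactory {

    @SuppressWarnings("unchecked")
    public static InputFormat<? extends Writable, ? extends Writable> create(String className, JobConf job)
            throws ClassNotFoundException {
        Class<? extends InputFormat> clazz = Class.forName(className).asSubclass(InputFormat.class);
        // ReflectionUtils.newInstance invokes configure(job) on JobConfigurable
        // implementations such as TextInputFormat, so no instanceof check is needed
        return ReflectionUtils.newInstance(clazz, job);
    }
}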

Example 30 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.

the class ResultMergeLocalFile method mergeTextCellWithoutComp.

private static void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO) {
    try {
        // delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            // we're done
            return;
        }
        // actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameNew);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
        String valueStr = null;
        try {
            // read/write all inputs
            for (MatrixObject in : inMO) {
                if (LOG.isTraceEnabled())
                    LOG.trace("ResultMerge (local, file): Merge input " + in.hashCode() + " (fname=" + in.getFileName() + ") via stream merge");
                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);
                LongWritable key = new LongWritable();
                Text value = new Text();
                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob, Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        IOUtilFunctions.closeSilently(reader);
                    }
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject), Text (org.apache.hadoop.io.Text), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException), IOException (java.io.IOException), BufferedWriter (java.io.BufferedWriter), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), FileSystem (org.apache.hadoop.fs.FileSystem), OutputStreamWriter (java.io.OutputStreamWriter), LongWritable (org.apache.hadoop.io.LongWritable), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit)
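The write side of the merge is plain FileSystem streaming. A minimal sketch isolating just that part, with a hypothetical TextCellWriter class, a placeholder output path, and overwrite enabled as in the merge above:

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class TextCellWriter {

    public static void main(String[] args) throws IOException {
        // placeholder output path
        Path out = new Path("/tmp/merged.textcell");
        FileSystem fs = out.getFileSystem(new JobConf());
        // create(path, true) overwrites any existing file, matching the merge above
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fs.create(out, true)))) {
            // a sample (row, column, value) line
            writer.write("1 1 7.0");
            writer.newLine();
        }
    }
}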

Aggregations

TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 49
InputSplit (org.apache.hadoop.mapred.InputSplit): 39
IOException (java.io.IOException): 26
Path (org.apache.hadoop.fs.Path): 25
JobConf (org.apache.hadoop.mapred.JobConf): 24
LongWritable (org.apache.hadoop.io.LongWritable): 19
Text (org.apache.hadoop.io.Text): 19
ArrayList (java.util.ArrayList): 16
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 14
ExecutorService (java.util.concurrent.ExecutorService): 12
Future (java.util.concurrent.Future): 8
FileSystem (org.apache.hadoop.fs.FileSystem): 8
FastStringTokenizer (org.apache.sysml.runtime.util.FastStringTokenizer): 6
Configuration (org.apache.hadoop.conf.Configuration): 4
Pair (org.apache.sysml.runtime.matrix.data.Pair): 4
LinkedList (java.util.LinkedList): 3
Properties (java.util.Properties): 3
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 3
HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormat): 3
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 3