
Example 36 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project ignite by apache.

Class HadoopSplitWrapperSelfTest, method testSerialization.

/**
 * Tests serialization of the wrapper and the wrapped native split.
 *
 * @throws Exception If failed.
 */
public void testSerialization() throws Exception {
    // Native Hadoop split covering bytes 100..600 of the file, located on two hosts.
    FileSplit nativeSplit = new FileSplit(new Path("/path/to/file"), 100, 500, new String[] { "host1", "host2" });
    assertEquals("/path/to/file:100+500", nativeSplit.toString());

    // Wrap the native split; the wrapper carries the host list explicitly.
    HadoopSplitWrapper split = HadoopUtils.wrapSplit(10, nativeSplit, nativeSplit.getLocations());
    assertEquals("[host1, host2]", Arrays.toString(split.hosts()));

    // Round-trip the wrapper through Java serialization.
    ByteArrayOutputStream buf = new ByteArrayOutputStream();
    ObjectOutput out = new ObjectOutputStream(buf);
    out.writeObject(split);

    ObjectInput in = new ObjectInputStream(new ByteArrayInputStream(buf.toByteArray()));
    final HadoopSplitWrapper res = (HadoopSplitWrapper) in.readObject();

    // The wrapped native split is fully restored...
    assertEquals("/path/to/file:100+500", HadoopUtils.unwrapSplit(res).toString());

    // ...but the host list is not serialized, so hosts() must fail on the copy.
    GridTestUtils.assertThrows(log, new Callable<Object>() {

        @Override
        public Object call() throws Exception {
            res.hosts();
            return null;
        }
    }, AssertionError.class, null);
}
Also used: java.io.ByteArrayInputStream, java.io.ByteArrayOutputStream, java.io.ObjectInput, java.io.ObjectInputStream, java.io.ObjectOutput, java.io.ObjectOutputStream, org.apache.hadoop.fs.Path, org.apache.hadoop.mapreduce.lib.input.FileSplit, org.apache.ignite.internal.processors.hadoop.HadoopSplitWrapper
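
The final assertion above is notable: the wrapped native split survives serialization, but the wrapper's host list does not, so hosts() is expected to fail on the deserialized copy. For reuse in similar tests, the manual round trip can be factored into a small helper; this is a sketch using only java.io, not a utility from the Ignite codebase:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;

final class SerializationTestUtil {

    /** Serializes an object to a byte array and reads it back. */
    @SuppressWarnings("unchecked")
    static <T extends Serializable> T roundTrip(T obj) throws IOException, ClassNotFoundException {
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        try (ObjectOutputStream out = new ObjectOutputStream(buf)) {
            out.writeObject(obj);
        }
        try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(buf.toByteArray()))) {
            return (T) in.readObject();
        }
    }
}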

Example 37 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project ignite by apache.

Class HadoopV2Context, method getInputSplit.

/** {@inheritDoc} */
@Override
public InputSplit getInputSplit() {
    if (inputSplit == null) {
        HadoopInputSplit split = ctx.taskInfo().inputSplit();

        if (split == null)
            return null;

        if (split instanceof HadoopFileBlock) {
            // A plain file block maps directly onto a Hadoop FileSplit;
            // the host array is not needed here, hence null.
            HadoopFileBlock fileBlock = (HadoopFileBlock) split;

            inputSplit = new FileSplit(new Path(fileBlock.file()), fileBlock.start(), fileBlock.length(), null);
        } else {
            // Any other split type is restored to its native Hadoop form.
            try {
                inputSplit = (InputSplit) ((HadoopV2TaskContext) ctx).getNativeSplit(split);
            } catch (IgniteCheckedException e) {
                throw new IllegalStateException(e);
            }
        }
    }

    // Cached after the first call.
    return inputSplit;
}
Also used: org.apache.hadoop.fs.Path, org.apache.hadoop.mapreduce.lib.input.FileSplit, org.apache.ignite.IgniteCheckedException, org.apache.ignite.hadoop.HadoopInputSplit, org.apache.ignite.internal.processors.hadoop.HadoopFileBlock
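
The HadoopFileBlock branch above builds a plain FileSplit and passes null for the host array. For reference, the split simply echoes back the values it was constructed with; a minimal standalone sketch with hypothetical values:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class FileSplitSketch {
    public static void main(String[] args) {
        // Hypothetical file and offsets; a null host array is legal when locality is unknown.
        FileSplit split = new FileSplit(new Path("/data/input.txt"), 0L, 1024L, null);

        System.out.println(split.getPath());   // /data/input.txt
        System.out.println(split.getStart());  // 0
        System.out.println(split.getLength()); // 1024
        System.out.println(split);             // /data/input.txt:0+1024 (same format asserted in Example 36)
    }
}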

Example 38 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project jena by apache.

Class AbstractRdfReader, method initialize.

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    LOG.debug("initialize({}, {})", genericSplit, context);
    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    // Find RDF language
    FileSplit split = (FileSplit) genericSplit;
    Path path = split.getPath();
    Lang lang = RDFLanguages.filenameToLang(path.getName());
    if (lang == null)
        throw new IOException("There is no registered RDF language for the input file " + path.toString());
    // Select the record reader and initialize
    this.reader = this.selectRecordReader(lang);
    this.reader.initialize(split, context);
}
Also used: java.io.IOException, org.apache.hadoop.fs.Path, org.apache.hadoop.mapreduce.lib.input.FileSplit, org.apache.jena.riot.Lang
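
The reader selects a parser purely from the file extension via Jena's RDFLanguages registry; filenameToLang returns null for unregistered extensions, which is what triggers the IOException above. A minimal standalone sketch with hypothetical file names:

import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFLanguages;

public class LangDetectionSketch {
    public static void main(String[] args) {
        // Registered extension: resolves to Turtle.
        Lang ttl = RDFLanguages.filenameToLang("data.ttl");
        System.out.println(ttl);

        // Unregistered extension: resolves to null, which the reader above
        // turns into an IOException.
        Lang unknown = RDFLanguages.filenameToLang("data.bin");
        System.out.println(unknown);
    }
}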

Example 39 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project jena by apache.

Class AbstractWholeFileNodeTupleReader, method initialize.

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);
    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;
    // Configuration
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn("Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown.  Consider setting {} to false to disable this behaviour", RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);
    // Figure out what portion of the file to read
    if (split.getStart() > 0)
        throw new IOException("This record reader requires a file split which covers the entire file");
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(config).getFileStatus(file).getLen();
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    this.compressionCodecs = factory.getCodec(file);
    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d", split.getStart(), split.getLength(), totalLength));
    if (totalLength > split.getLength())
        throw new IOException("This record reader requires a file split which covers the entire file");
    // Open the file and prepare the input stream
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    this.length = split.getLength();
    if (this.compressionCodecs != null) {
        // Compressed input
        input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
    } else {
        // Uncompressed input
        input = new TrackedInputStream(fileIn);
    }
    // Set up background thread for parser
    iter = this.getPipedIterator();
    this.stream = this.getPipedStream(iter, this.input);
    RDFParserBuilder builder = RdfIOUtils.createRDFParserBuilder(context, file);
    Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), builder);
    this.parserThread = new Thread(parserRunnable);
    this.parserThread.setDaemon(true);
    this.parserThread.start();
}
Also used: java.io.IOException, org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.FSDataInputStream, org.apache.hadoop.fs.FileSystem, org.apache.hadoop.fs.Path, org.apache.hadoop.io.compress.CompressionCodecFactory, org.apache.hadoop.mapreduce.lib.input.FileSplit, org.apache.jena.hadoop.rdf.io.input.util.TrackedInputStream, org.apache.jena.riot.RDFParserBuilder
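
The compression handling follows Hadoop's standard pattern: CompressionCodecFactory.getCodec(path) matches the file extension against the configured codecs and returns null for plain files. Condensed into a hypothetical helper (a sketch, not code from the Jena reader):

import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public final class CodecOpen {

    /** Opens a file, transparently decompressing it when its extension maps to a codec. */
    static InputStream open(Configuration conf, Path file) throws IOException {
        FileSystem fs = file.getFileSystem(conf);
        FSDataInputStream raw = fs.open(file);
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        return codec != null ? codec.createInputStream(raw) : raw;
    }
}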

Aggregations

FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 39 usages
Path (org.apache.hadoop.fs.Path): 22 usages
Configuration (org.apache.hadoop.conf.Configuration): 13 usages
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 12 usages
IOException (java.io.IOException): 10 usages
ArrayList (java.util.ArrayList): 10 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 7 usages
BSONFileSplit (com.mongodb.hadoop.input.BSONFileSplit): 4 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 4 usages
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 4 usages
Text (org.apache.hadoop.io.Text): 3 usages
NodeControllerInfo (org.apache.hyracks.api.client.NodeControllerInfo): 3 usages
BSONSplitter (com.mongodb.hadoop.splitter.BSONSplitter): 2 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 2 usages
File (java.io.File): 2 usages
Constructor (java.lang.reflect.Constructor): 2 usages
Schema (org.apache.avro.Schema): 2 usages
AvroKeyRecordReader (org.apache.avro.mapreduce.AvroKeyRecordReader): 2 usages
FileSplitPartitionQuery (org.apache.gora.query.impl.FileSplitPartitionQuery): 2 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 2 usages