
Example 11 with FileSplit

use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project asterixdb by apache.

the class SchedulerTest method testSchedulerLargerHDFS.

/**
     * Test the case where the HDFS cluster is larger than the Hyracks cluster.
     *
     * @throws Exception
     */
public void testSchedulerLargerHDFS() throws Exception {
    Map<String, NodeControllerInfo> ncNameToNcInfos = TestUtils.generateNodeControllerInfo(6, "nc", "10.0.0.", 5099, 5098, 5097);
    List<InputSplit> fileSplits = new ArrayList<>();
    fileSplits.add(new FileSplit(new Path("part-1"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" }));
    fileSplits.add(new FileSplit(new Path("part-2"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-3"), 0, 0, new String[] { "10.0.0.4", "10.0.0.5", "10.0.0.6" }));
    fileSplits.add(new FileSplit(new Path("part-4"), 0, 0, new String[] { "10.0.0.2", "10.0.0.1", "10.0.0.6" }));
    fileSplits.add(new FileSplit(new Path("part-5"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-6"), 0, 0, new String[] { "10.0.0.2", "10.0.0.3", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-7"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" }));
    fileSplits.add(new FileSplit(new Path("part-8"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-9"), 0, 0, new String[] { "10.0.0.4", "10.0.0.5", "10.0.0.6" }));
    fileSplits.add(new FileSplit(new Path("part-10"), 0, 0, new String[] { "10.0.0.2", "10.0.0.1", "10.0.0.6" }));
    fileSplits.add(new FileSplit(new Path("part-11"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.7" }));
    fileSplits.add(new FileSplit(new Path("part-12"), 0, 0, new String[] { "10.0.0.2", "10.0.0.3", "10.0.0.5" }));
    Scheduler scheduler = new Scheduler(ncNameToNcInfos);
    String[] locationConstraints = scheduler.getLocationConstraints(fileSplits);
    String[] expectedResults = new String[] { "nc1", "nc4", "nc6", "nc1", "nc4", "nc2", "nc2", "nc3", "nc6", "nc5", "nc3", "nc5" };
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(expectedResults[i], locationConstraints[i]);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), NodeControllerInfo (org.apache.hyracks.api.client.NodeControllerInfo), ArrayList (java.util.ArrayList), FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
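
With six NCs and twelve splits, the expected assignment is perfectly balanced (two splits per NC) and eleven of the twelve placements are data-local; only part-10 lands on a non-local node (nc5). A minimal greedy sketch of that balancing idea follows. It is not the actual Hyracks Scheduler, and the ipToNc mapping (e.g. "10.0.0.1" -> "nc1") is an assumption for illustration; it needs java.util.Comparator, java.util.HashMap, and java.util.Map.

// Minimal greedy sketch (NOT the Hyracks Scheduler): each NC takes at most
// ceil(splits / NCs) splits, and a split prefers an NC whose IP appears in
// its location hints; otherwise it falls back to the least-loaded NC.
static String[] assignGreedily(String[][] splitLocations, Map<String, String> ipToNc) {
    int capacity = (int) Math.ceil((double) splitLocations.length / ipToNc.size());
    Map<String, Integer> load = new HashMap<>();
    String[] result = new String[splitLocations.length];
    for (int i = 0; i < splitLocations.length; i++) {
        String chosen = null;
        // First pass: try a data-local NC that still has spare capacity.
        for (String ip : splitLocations[i]) {
            String nc = ipToNc.get(ip);
            if (nc != null && load.getOrDefault(nc, 0) < capacity) {
                chosen = nc;
                break;
            }
        }
        // Fallback: least-loaded NC, sacrificing locality.
        if (chosen == null) {
            chosen = ipToNc.values().stream()
                    .min(Comparator.comparingInt(nc -> load.getOrDefault(nc, 0)))
                    .orElseThrow(IllegalStateException::new);
        }
        load.merge(chosen, 1, Integer::sum);
        result[i] = chosen;
    }
    return result;
}

The real Scheduler's choices differ in detail (the expected array shows it spreading load across replica sets rather than always taking the first local match), but the capacity bound and the local-first/fallback structure are the same ideas the expected results exhibit.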

Example 12 with FileSplit

use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project asterixdb by apache.

the class FileSplitsFactory method bytesToSplits.

/**
     * Convert serialized bytes back into a list of file splits.
     *
     * @param bytes the byte array produced by serializing the splits
     * @return the deserialized list of file splits
     * @throws HyracksDataException if reflection or deserialization fails
     */
private List<FileSplit> bytesToSplits(byte[] bytes) throws HyracksDataException {
    try {
        // Locate the (possibly non-public) no-arg constructor of the split class.
        Class<?> splitClass = Class.forName(splitClassName);
        Constructor<?> defaultConstructor = null;
        for (Constructor<?> constructor : splitClass.getDeclaredConstructors()) {
            if (constructor.getParameterTypes().length == 0) {
                constructor.setAccessible(true);
                defaultConstructor = constructor;
                break;
            }
        }
        if (defaultConstructor == null) {
            // Wrapped into a HyracksDataException by the catch below.
            throw new NoSuchMethodException(splitClassName + " has no default constructor");
        }
        // Deserialize: a count prefix followed by each split's Writable fields.
        ByteArrayInputStream bis = new ByteArrayInputStream(bytes);
        DataInputStream dis = new DataInputStream(bis);
        int size = dis.readInt();
        List<FileSplit> splits = new ArrayList<>(size);
        for (int i = 0; i < size; i++) {
            splits.add((FileSplit) defaultConstructor.newInstance());
            splits.get(i).readFields(dis);
        }
        dis.close();
        return splits;
    } catch (Exception e) {
        throw new HyracksDataException(e);
    }
}
Also used: ByteArrayInputStream (java.io.ByteArrayInputStream), Constructor (java.lang.reflect.Constructor), ArrayList (java.util.ArrayList), DataInputStream (java.io.DataInputStream), FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit), HyracksDataException (org.apache.hyracks.api.exceptions.HyracksDataException), IOException (java.io.IOException)
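
Since deserialization goes through readFields, each split is a Hadoop Writable, so the serializing counterpart is the mirror image: write a count prefix, then let each split write itself. A minimal sketch of such a counterpart is below; FileSplitsFactory presumably has its own version, and the method name here is illustrative.

private byte[] splitsToBytes(List<FileSplit> splits) throws HyracksDataException {
    try (ByteArrayOutputStream bos = new ByteArrayOutputStream();
         DataOutputStream dos = new DataOutputStream(bos)) {
        // Count prefix, read back by dis.readInt() in bytesToSplits().
        dos.writeInt(splits.size());
        for (FileSplit split : splits) {
            // Writable counterpart of readFields(dis).
            split.write(dos);
        }
        dos.flush();
        return bos.toByteArray();
    } catch (Exception e) {
        throw new HyracksDataException(e);
    }
}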

Example 13 with FileSplit

use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project carbondata by apache.

the class CarbonInputFormat method getSplitsInternal.

private List<InputSplit> getSplitsInternal(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    List<InputSplit> carbonSplits = new ArrayList<InputSplit>(splits.size());
    // identify table blocks
    for (InputSplit inputSplit : splits) {
        FileSplit fileSplit = (FileSplit) inputSplit;
        String segmentId = CarbonTablePath.DataPathUtil.getSegmentId(fileSplit.getPath().toString());
        if (segmentId.equals(CarbonCommonConstants.INVALID_SEGMENT_ID)) {
            continue;
        }
        carbonSplits.add(CarbonInputSplit.from(segmentId, fileSplit, ColumnarFormatVersion.valueOf(CarbonCommonConstants.CARBON_DATA_FILE_DEFAULT_VERSION)));
    }
    return carbonSplits;
}
Also used: FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
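
The cast in getSplitsInternal assumes super.getSplits(job) returns only file-based splits, which holds for FileInputFormat subclasses but would fail with a ClassCastException for other input formats. A small defensive helper (illustrative only, not part of CarbonInputFormat) makes that assumption explicit and fails with a clearer message:

// Fail fast with a descriptive error instead of a bare ClassCastException.
private static FileSplit asFileSplit(InputSplit split) throws IOException {
    if (split instanceof FileSplit) {
        return (FileSplit) split;
    }
    throw new IOException("Expected a FileSplit but got " + split.getClass().getName());
}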

Example 14 with FileSplit

use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project gora by apache.

the class FileBackedDataStoreBase method getPartitions.

@Override
public List<PartitionQuery<K, T>> getPartitions(Query<K, T> query) {
    List<InputSplit> splits = null;
    List<PartitionQuery<K, T>> queries = null;
    try {
        splits = GoraMapReduceUtils.getSplits(getConf(), inputPath);
        queries = new ArrayList<>(splits.size());
        for (InputSplit split : splits) {
            queries.add(new FileSplitPartitionQuery<>(query, (FileSplit) split));
        }
    } catch (IOException ex) {
        LOG.error(ex.getMessage(), ex);
    }
    return queries;
}
Also used: IOException (java.io.IOException), FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit), InputSplit (org.apache.hadoop.mapreduce.InputSplit), PartitionQuery (org.apache.gora.query.PartitionQuery), FileSplitPartitionQuery (org.apache.gora.query.impl.FileSplitPartitionQuery)
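
Note that when GoraMapReduceUtils.getSplits throws, this method logs the IOException and falls through, returning null rather than an empty list, so callers must null-check the result. A hypothetical caller sketch (store and query are assumed to be in scope):

// getPartitions returns null when split computation fails, so the
// result must be checked before iterating.
List<PartitionQuery<K, T>> partitions = store.getPartitions(query);
if (partitions == null) {
    throw new IllegalStateException("Could not compute partitions for query: " + query);
}
for (PartitionQuery<K, T> partition : partitions) {
    // hand each partition to a worker, typically one per input split
}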

Example 15 with FileSplit

use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project ignite by apache.

the class HadoopV2Splitter method splitJob.

/**
     * @param ctx Job context.
     * @return Collection of mapped splits.
     * @throws IgniteCheckedException If mapping failed.
     */
public static Collection<HadoopInputSplit> splitJob(JobContext ctx) throws IgniteCheckedException {
    try {
        InputFormat<?, ?> format = ReflectionUtils.newInstance(ctx.getInputFormatClass(), ctx.getConfiguration());
        assert format != null;
        List<InputSplit> splits = format.getSplits(ctx);
        Collection<HadoopInputSplit> res = new ArrayList<>(splits.size());
        int id = 0;
        for (InputSplit nativeSplit : splits) {
            if (nativeSplit instanceof FileSplit) {
                FileSplit s = (FileSplit) nativeSplit;
                res.add(new HadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(), s.getLength()));
            } else
                res.add(HadoopUtils.wrapSplit(id, nativeSplit, nativeSplit.getLocations()));
            id++;
        }
        return res;
    } catch (IOException | ClassNotFoundException e) {
        throw new IgniteCheckedException(e);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new IgniteInterruptedCheckedException(e);
    }
}
Also used: ArrayList (java.util.ArrayList), HadoopInputSplit (org.apache.ignite.hadoop.HadoopInputSplit), IOException (java.io.IOException), FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit), HadoopFileBlock (org.apache.ignite.internal.processors.hadoop.HadoopFileBlock), IgniteInterruptedCheckedException (org.apache.ignite.internal.IgniteInterruptedCheckedException), IgniteCheckedException (org.apache.ignite.IgniteCheckedException), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
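
The FileSplit branch is the fast path: a file-based split already carries the hosts, path, offset, and length that a HadoopFileBlock needs, so no serialized native split has to be shipped. Factored out as a helper (illustrative only, mirroring the calls in splitJob above):

// Hosts, path URI, start offset, and length map 1:1 onto HadoopFileBlock.
static HadoopFileBlock toFileBlock(FileSplit s) throws IOException {
    return new HadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(), s.getLength());
}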

Aggregations

FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 39 uses
Path (org.apache.hadoop.fs.Path): 22 uses
Configuration (org.apache.hadoop.conf.Configuration): 13 uses
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 12 uses
IOException (java.io.IOException): 10 uses
ArrayList (java.util.ArrayList): 10 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 7 uses
BSONFileSplit (com.mongodb.hadoop.input.BSONFileSplit): 4 uses
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 4 uses
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 4 uses
Text (org.apache.hadoop.io.Text): 3 uses
NodeControllerInfo (org.apache.hyracks.api.client.NodeControllerInfo): 3 uses
BSONSplitter (com.mongodb.hadoop.splitter.BSONSplitter): 2 uses
ByteArrayInputStream (java.io.ByteArrayInputStream): 2 uses
File (java.io.File): 2 uses
Constructor (java.lang.reflect.Constructor): 2 uses
Schema (org.apache.avro.Schema): 2 uses
AvroKeyRecordReader (org.apache.avro.mapreduce.AvroKeyRecordReader): 2 uses
FileSplitPartitionQuery (org.apache.gora.query.impl.FileSplitPartitionQuery): 2 uses
FileStatus (org.apache.hadoop.fs.FileStatus): 2 uses