
Example 1 with HadoopFileBlock

Use of org.apache.ignite.internal.processors.hadoop.HadoopFileBlock in project ignite by apache.

Class IgniteHadoopWeightedMapReducePlanner, method affinityNodesForSplit.

/**
     * Get affinity nodes for the given input split.
     * <p>
     * Order in the returned collection *is* significant, meaning that nodes containing more data
     * go first. This way, the first nodes in the collection are considered preferable for scheduling.
     *
     * @param split Split.
     * @param top Topology.
     * @return Affinity nodes.
     * @throws IgniteCheckedException If failed.
     */
private Collection<UUID> affinityNodesForSplit(HadoopInputSplit split, HadoopMapReducePlanTopology top) throws IgniteCheckedException {
    Collection<UUID> igfsNodeIds = igfsAffinityNodesForSplit(split);
    if (igfsNodeIds != null)
        return igfsNodeIds;
    Map<NodeIdAndLength, UUID> res = new TreeMap<>();
    for (String host : split.hosts()) {
        long len = split instanceof HadoopFileBlock ? ((HadoopFileBlock) split).length() : 0L;
        HadoopMapReducePlanGroup grp = top.groupForHost(host);
        if (grp != null) {
            for (int i = 0; i < grp.nodeCount(); i++) {
                UUID nodeId = grp.nodeId(i);
                res.put(new NodeIdAndLength(nodeId, len), nodeId);
            }
        }
    }
    return new LinkedHashSet<>(res.values());
}
Also used : LinkedHashSet(java.util.LinkedHashSet) UUID(java.util.UUID) TreeMap(java.util.TreeMap) HadoopFileBlock(org.apache.ignite.internal.processors.hadoop.HadoopFileBlock) HadoopMapReducePlanGroup(org.apache.ignite.internal.processors.hadoop.planner.HadoopMapReducePlanGroup) HadoopIgfsEndpoint(org.apache.ignite.internal.processors.hadoop.igfs.HadoopIgfsEndpoint)
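The descending order in the snippet above comes from the TreeMap: its keys sort primarily by block length in reverse, so nodes holding more of the split's data are iterated first and survive in that order into the LinkedHashSet. NodeIdAndLength is an Ignite-internal planner class; the following is only a minimal sketch of such a sort key, not the actual implementation.

import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.TreeMap;
import java.util.UUID;

// Hypothetical sketch of a sort key that puts nodes with more local data first.
// The real NodeIdAndLength lives inside the planner and may differ in detail.
class NodeIdAndLengthSketch implements Comparable<NodeIdAndLengthSketch> {
    private final UUID nodeId;
    private final long len;

    NodeIdAndLengthSketch(UUID nodeId, long len) {
        this.nodeId = nodeId;
        this.len = len;
    }

    /** Longer lengths compare as "smaller", so they come first in a TreeMap; node id breaks ties. */
    @Override public int compareTo(NodeIdAndLengthSketch other) {
        int cmp = Long.compare(other.len, len);

        return cmp != 0 ? cmp : nodeId.compareTo(other.nodeId);
    }

    /** Orders node ids by descending amount of local data, mirroring the pattern above. */
    static Collection<UUID> order(Map<UUID, Long> lenByNodeId) {
        Map<NodeIdAndLengthSketch, UUID> sorted = new TreeMap<>();

        for (Map.Entry<UUID, Long> e : lenByNodeId.entrySet())
            sorted.put(new NodeIdAndLengthSketch(e.getKey(), e.getValue()), e.getKey());

        // LinkedHashSet preserves the descending-by-length iteration order and drops duplicates.
        return new LinkedHashSet<>(sorted.values());
    }
}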

Example 2 with HadoopFileBlock

Use of org.apache.ignite.internal.processors.hadoop.HadoopFileBlock in project ignite by apache.

Class IgniteHadoopWeightedMapReducePlanner, method igfsAffinityNodesForSplit.

/**
     * Get IGFS affinity nodes for split if possible.
     * <p>
     * Order in the returned collection *is* significant, meaning that nodes containing more data
     * go first. This way, the first nodes in the collection are considered preferable for scheduling.
     *
     * @param split Input split.
     * @return IGFS affinity or {@code null} if IGFS is not available.
     * @throws IgniteCheckedException If failed.
     */
@Nullable
private Collection<UUID> igfsAffinityNodesForSplit(HadoopInputSplit split) throws IgniteCheckedException {
    if (split instanceof HadoopFileBlock) {
        HadoopFileBlock split0 = (HadoopFileBlock) split;
        if (IgniteFileSystem.IGFS_SCHEME.equalsIgnoreCase(split0.file().getScheme())) {
            HadoopIgfsEndpoint endpoint = new HadoopIgfsEndpoint(split0.file().getAuthority());
            IgfsEx igfs = (IgfsEx) ((IgniteEx) ignite).igfsx(endpoint.igfs());
            if (igfs != null && !igfs.isProxy(split0.file())) {
                IgfsPath path = new IgfsPath(split0.file());
                if (igfs.exists(path)) {
                    Collection<IgfsBlockLocation> blocks;
                    try {
                        blocks = igfs.affinity(path, split0.start(), split0.length());
                    } catch (IgniteException e) {
                        throw new IgniteCheckedException("Failed to get IGFS file block affinity [path=" + path + ", start=" + split0.start() + ", len=" + split0.length() + ']', e);
                    }
                    assert blocks != null;
                    if (blocks.size() == 1)
                        return blocks.iterator().next().nodeIds();
                    else {
                        // The most "local" nodes go first.
                        Map<UUID, Long> idToLen = new HashMap<>();
                        for (IgfsBlockLocation block : blocks) {
                            for (UUID id : block.nodeIds()) {
                                Long len = idToLen.get(id);
                                idToLen.put(id, len == null ? block.length() : block.length() + len);
                            }
                        }
                        // Sort the nodes in non-ascending order by contained data lengths.
                        Map<NodeIdAndLength, UUID> res = new TreeMap<>();
                        for (Map.Entry<UUID, Long> idToLenEntry : idToLen.entrySet()) {
                            UUID id = idToLenEntry.getKey();
                            res.put(new NodeIdAndLength(id, idToLenEntry.getValue()), id);
                        }
                        return new LinkedHashSet<>(res.values());
                    }
                }
            }
        }
    }
    return null;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) HashMap(java.util.HashMap) IdentityHashMap(java.util.IdentityHashMap) HadoopIgfsEndpoint(org.apache.ignite.internal.processors.hadoop.igfs.HadoopIgfsEndpoint) IgfsBlockLocation(org.apache.ignite.igfs.IgfsBlockLocation) HadoopFileBlock(org.apache.ignite.internal.processors.hadoop.HadoopFileBlock) TreeMap(java.util.TreeMap) IgfsPath(org.apache.ignite.igfs.IgfsPath) IgfsEx(org.apache.ignite.internal.processors.igfs.IgfsEx) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) IgniteException(org.apache.ignite.IgniteException) UUID(java.util.UUID) HashMap(java.util.HashMap) Map(java.util.Map) IdentityHashMap(java.util.IdentityHashMap) TreeMap(java.util.TreeMap) Nullable(org.jetbrains.annotations.Nullable)
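Before sorting, the method above sums, per node, the lengths of all IGFS blocks backing the split (the idToLen map). Below is a small self-contained sketch of that aggregation step, with a hypothetical stand-in for IgfsBlockLocation; the resulting map is exactly what the descending sort in the previous sketch expects.

import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;

// Hypothetical stand-in for IgfsBlockLocation: a block length plus the ids of the nodes holding it.
final class BlockSketch {
    final long len;
    final Collection<UUID> nodeIds;

    BlockSketch(long len, Collection<UUID> nodeIds) {
        this.len = len;
        this.nodeIds = nodeIds;
    }
}

final class LocalityWeights {
    private LocalityWeights() {
        // No instances.
    }

    /** Sums the bytes each node holds across all blocks of a split (the idToLen step above). */
    static Map<UUID, Long> bytesPerNode(Collection<BlockSketch> blocks) {
        Map<UUID, Long> idToLen = new HashMap<>();

        for (BlockSketch block : blocks)
            for (UUID id : block.nodeIds)
                idToLen.merge(id, block.len, Long::sum);

        return idToLen;
    }
}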

Example 3 with HadoopFileBlock

Use of org.apache.ignite.internal.processors.hadoop.HadoopFileBlock in project ignite by apache.

Class HadoopV2Job, method input.

/** {@inheritDoc} */
@Override
public Collection<HadoopInputSplit> input() {
    ClassLoader oldLdr = HadoopCommonUtils.setContextClassLoader(jobConf.getClassLoader());
    try {
        String jobDirPath = jobConf.get(MRJobConfig.MAPREDUCE_JOB_DIR);
        if (jobDirPath == null) {
            // Assume that we have the needed classes and try to generate the input splits ourselves.
            if (jobConf.getUseNewMapper())
                return HadoopV2Splitter.splitJob(jobCtx);
            else
                return HadoopV1Splitter.splitJob(jobConf);
        }
        Path jobDir = new Path(jobDirPath);
        try {
            FileSystem fs = fileSystem(jobDir.toUri(), jobConf);
            JobSplit.TaskSplitMetaInfo[] metaInfos = SplitMetaInfoReader.readSplitMetaInfo(hadoopJobID, fs, jobConf, jobDir);
            if (F.isEmpty(metaInfos))
                throw new IgniteCheckedException("No input splits found.");
            Path splitsFile = JobSubmissionFiles.getJobSplitFile(jobDir);
            try (FSDataInputStream in = fs.open(splitsFile)) {
                Collection<HadoopInputSplit> res = new ArrayList<>(metaInfos.length);
                for (JobSplit.TaskSplitMetaInfo metaInfo : metaInfos) {
                    long off = metaInfo.getStartOffset();
                    String[] hosts = metaInfo.getLocations();
                    in.seek(off);
                    String clsName = Text.readString(in);
                    HadoopFileBlock block = HadoopV1Splitter.readFileBlock(clsName, in, hosts);
                    if (block == null)
                        block = HadoopV2Splitter.readFileBlock(clsName, in, hosts);
                    res.add(block != null ? block : new HadoopExternalSplit(hosts, off));
                }
                return res;
            }
        } catch (Throwable e) {
            if (e instanceof Error)
                throw (Error) e;
            else
                throw transformException(e);
        }
    } catch (IgniteCheckedException e) {
        throw new IgniteException(e);
    } finally {
        HadoopCommonUtils.restoreContextClassLoader(oldLdr);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) HadoopInputSplit(org.apache.ignite.hadoop.HadoopInputSplit) HadoopFileBlock(org.apache.ignite.internal.processors.hadoop.HadoopFileBlock) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) JobSplit(org.apache.hadoop.mapreduce.split.JobSplit) IgniteException(org.apache.ignite.IgniteException) FileSystem(org.apache.hadoop.fs.FileSystem) HadoopClassLoader(org.apache.ignite.internal.processors.hadoop.HadoopClassLoader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) HadoopExternalSplit(org.apache.ignite.internal.processors.hadoop.HadoopExternalSplit)
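Whether the splits come from the splitters or are read back from the job staging directory, callers see them only through the HadoopInputSplit interface. A hypothetical helper that inspects such a collection (for example, the result of input() above) and distinguishes file-backed blocks from opaque external splits might look like this:

import java.util.Arrays;
import java.util.Collection;

import org.apache.ignite.hadoop.HadoopInputSplit;
import org.apache.ignite.internal.processors.hadoop.HadoopFileBlock;

final class SplitInspection {
    /** Prints what is known about each split; file-backed blocks expose file, offset and length. */
    static void print(Collection<HadoopInputSplit> splits) {
        for (HadoopInputSplit split : splits) {
            if (split instanceof HadoopFileBlock) {
                HadoopFileBlock block = (HadoopFileBlock) split;

                // Enough information for data-local planning.
                System.out.println(block.file() + " @ " + block.start() + " + " + block.length());
            }
            else
                // E.g. HadoopExternalSplit: only hosts and an offset into the serialized splits file.
                System.out.println("opaque split on hosts " + Arrays.toString(split.hosts()));
        }
    }
}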

Example 4 with HadoopFileBlock

Use of org.apache.ignite.internal.processors.hadoop.HadoopFileBlock in project ignite by apache.

Class HadoopV2Splitter, method splitJob.

/**
     * @param ctx Job context.
     * @return Collection of mapped splits.
     * @throws IgniteCheckedException If mapping failed.
     */
public static Collection<HadoopInputSplit> splitJob(JobContext ctx) throws IgniteCheckedException {
    try {
        InputFormat<?, ?> format = ReflectionUtils.newInstance(ctx.getInputFormatClass(), ctx.getConfiguration());
        assert format != null;
        List<InputSplit> splits = format.getSplits(ctx);
        Collection<HadoopInputSplit> res = new ArrayList<>(splits.size());
        int id = 0;
        for (InputSplit nativeSplit : splits) {
            if (nativeSplit instanceof FileSplit) {
                FileSplit s = (FileSplit) nativeSplit;
                res.add(new HadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(), s.getLength()));
            } else
                res.add(HadoopUtils.wrapSplit(id, nativeSplit, nativeSplit.getLocations()));
            id++;
        }
        return res;
    } catch (IOException | ClassNotFoundException e) {
        throw new IgniteCheckedException(e);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new IgniteInterruptedCheckedException(e);
    }
}
Also used : ArrayList(java.util.ArrayList) HadoopInputSplit(org.apache.ignite.hadoop.HadoopInputSplit) IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) HadoopFileBlock(org.apache.ignite.internal.processors.hadoop.HadoopFileBlock) IgniteInterruptedCheckedException(org.apache.ignite.internal.IgniteInterruptedCheckedException) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) InputSplit(org.apache.hadoop.mapreduce.InputSplit) HadoopInputSplit(org.apache.ignite.hadoop.HadoopInputSplit)
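A FileSplit and a HadoopFileBlock carry the same tuple of hosts, file URI, start offset and length, which is why the conversion above is one-to-one. A hypothetical construction example follows; the host names and file path are made up for illustration.

import java.net.URI;

import org.apache.ignite.internal.processors.hadoop.HadoopFileBlock;

final class FileBlockExample {
    /** Builds a block describing the first 128 MB of a (made-up) HDFS file. */
    static HadoopFileBlock sampleBlock() {
        // Replica locations as reported by the file system (assumed host names).
        String[] hosts = { "host-1.example.com", "host-2.example.com", "host-3.example.com" };

        URI file = URI.create("hdfs://namenode.example.com:8020/data/input.log"); // assumed path
        long start = 0L;                // byte offset of the block within the file
        long len = 128L * 1024 * 1024;  // block length in bytes

        return new HadoopFileBlock(hosts, file, start, len);
    }
}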

Example 5 with HadoopFileBlock

Use of org.apache.ignite.internal.processors.hadoop.HadoopFileBlock in project ignite by apache.

Class HadoopWeightedMapReducePlannerTest, method testHdfsSplitsReplication.

/**
     * Test HDFS splits with Replication == 3.
     *
     * @throws Exception If failed.
     */
public void testHdfsSplitsReplication() throws Exception {
    IgfsMock igfs = LocationsBuilder.create().add(0, NODE_1).add(50, NODE_2).add(100, NODE_3).buildIgfs();
    final List<HadoopInputSplit> splits = new ArrayList<>();
    splits.add(new HadoopFileBlock(new String[] { HOST_1, HOST_2, HOST_3 }, URI.create("hdfs://" + HOST_1 + "/x"), 0, 50));
    splits.add(new HadoopFileBlock(new String[] { HOST_2, HOST_3, HOST_4 }, URI.create("hdfs://" + HOST_2 + "/x"), 50, 100));
    splits.add(new HadoopFileBlock(new String[] { HOST_3, HOST_4, HOST_5 }, URI.create("hdfs://" + HOST_3 + "/x"), 100, 37));
    // The following splits belong to hosts that are not part of the Ignite topology at all.
    // This means that these splits should be assigned to the least loaded nodes:
    splits.add(new HadoopFileBlock(new String[] { HOST_4, HOST_5, HOST_1 }, URI.create("hdfs://" + HOST_4 + "/x"), 138, 2));
    splits.add(new HadoopFileBlock(new String[] { HOST_5, HOST_1, HOST_2 }, URI.create("hdfs://" + HOST_5 + "/x"), 140, 3));
    final int expReducers = 8;
    HadoopPlannerMockJob job = new HadoopPlannerMockJob(splits, expReducers);
    IgniteHadoopWeightedMapReducePlanner planner = createPlanner(igfs);
    final HadoopMapReducePlan plan = planner.preparePlan(job, NODES, null);
    checkPlanMappers(plan, splits, NODES, true);
    checkPlanReducers(plan, NODES, expReducers, true);
}
Also used : HadoopMapReducePlan(org.apache.ignite.hadoop.HadoopMapReducePlan) IgniteHadoopWeightedMapReducePlanner(org.apache.ignite.hadoop.mapreduce.IgniteHadoopWeightedMapReducePlanner) ArrayList(java.util.ArrayList) IgfsMock(org.apache.ignite.internal.processors.igfs.IgfsMock) HadoopInputSplit(org.apache.ignite.hadoop.HadoopInputSplit) HadoopFileBlock(org.apache.ignite.internal.processors.hadoop.HadoopFileBlock)
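The commented splits in the test above have no hosts inside the Ignite topology, so the planner falls back to spreading them across the least loaded nodes. The weighted planner does this with configurable locality weights; the sketch below only illustrates the simplest form of such a fallback and is not the planner's actual algorithm.

import java.util.Map;
import java.util.UUID;

final class LeastLoadedFallback {
    /** Picks the node with the fewest mappers assigned so far (null for an empty map). */
    static UUID pick(Map<UUID, Integer> mappersPerNode) {
        UUID best = null;
        int bestLoad = Integer.MAX_VALUE;

        for (Map.Entry<UUID, Integer> e : mappersPerNode.entrySet()) {
            if (e.getValue() < bestLoad) {
                best = e.getKey();
                bestLoad = e.getValue();
            }
        }

        return best;
    }
}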

Aggregations

HadoopFileBlock (org.apache.ignite.internal.processors.hadoop.HadoopFileBlock): 12
HadoopInputSplit (org.apache.ignite.hadoop.HadoopInputSplit): 8
ArrayList (java.util.ArrayList): 6
IgniteCheckedException (org.apache.ignite.IgniteCheckedException): 6
Path (org.apache.hadoop.fs.Path): 3
HadoopMapReducePlan (org.apache.ignite.hadoop.HadoopMapReducePlan): 3
IgniteHadoopWeightedMapReducePlanner (org.apache.ignite.hadoop.mapreduce.IgniteHadoopWeightedMapReducePlanner): 3
IgfsPath (org.apache.ignite.igfs.IgfsPath): 3
HadoopJobEx (org.apache.ignite.internal.processors.hadoop.HadoopJobEx): 3
IgfsMock (org.apache.ignite.internal.processors.igfs.IgfsMock): 3
IOException (java.io.IOException): 2
URI (java.net.URI): 2
LinkedHashSet (java.util.LinkedHashSet): 2
TreeMap (java.util.TreeMap): 2
UUID (java.util.UUID): 2
FileSplit (org.apache.hadoop.mapred.FileSplit): 2
InputSplit (org.apache.hadoop.mapred.InputSplit): 2
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 2
IgniteException (org.apache.ignite.IgniteException): 2
HadoopTaskInfo (org.apache.ignite.internal.processors.hadoop.HadoopTaskInfo): 2