Example 31 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project asterixdb by apache.

the class SchedulerTest method testSchedulerSimple.

/**
 * Test the scheduler for the case when the Hyracks cluster is the HDFS cluster.
 *
 * @throws Exception
 */
public void testSchedulerSimple() throws Exception {
    Map<String, NodeControllerInfo> ncNameToNcInfos = TestUtils.generateNodeControllerInfo(6, "nc", "10.0.0.", 5099, 5098, 5097);
    InputSplit[] fileSplits = new InputSplit[6];
    fileSplits[0] = new FileSplit(new Path("part-1"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" });
    fileSplits[1] = new FileSplit(new Path("part-2"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" });
    fileSplits[2] = new FileSplit(new Path("part-3"), 0, 0, new String[] { "10.0.0.4", "10.0.0.5", "10.0.0.6" });
    fileSplits[3] = new FileSplit(new Path("part-4"), 0, 0, new String[] { "10.0.0.2", "10.0.0.1", "10.0.0.6" });
    fileSplits[4] = new FileSplit(new Path("part-5"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" });
    fileSplits[5] = new FileSplit(new Path("part-6"), 0, 0, new String[] { "10.0.0.2", "10.0.0.3", "10.0.0.5" });
    String[] expectedResults = new String[] { "nc1", "nc4", "nc6", "nc2", "nc3", "nc5" };
    Scheduler scheduler = new Scheduler(ncNameToNcInfos);
    String[] locationConstraints = scheduler.getLocationConstraints(fileSplits);
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(expectedResults[i], locationConstraints[i]);
    }
    ClusterTopology topology = parseTopology();
    scheduler = new Scheduler(ncNameToNcInfos, topology);
    locationConstraints = scheduler.getLocationConstraints(fileSplits);
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(expectedResults[i], locationConstraints[i]);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) NodeControllerInfo(org.apache.hyracks.api.client.NodeControllerInfo) FileSplit(org.apache.hadoop.mapred.FileSplit) ClusterTopology(org.apache.hyracks.api.topology.ClusterTopology) InputSplit(org.apache.hadoop.mapred.InputSplit)
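
The expected results above encode the scheduler's data-locality goal: TestUtils.generateNodeControllerInfo(6, "nc", "10.0.0.", ...) registers node ncN at IP 10.0.0.N, so each split should be assigned to a node that stores one of its replicas. A minimal sketch of that invariant that could be appended to the test, assuming the ncN-to-10.0.0.N naming holds (the original test pins exact assignments instead); it needs an import of java.util.Arrays:

    // Hedged locality check: every split's assigned NC must be one of its
    // replica hosts, given the assumed ncN <-> 10.0.0.N naming scheme.
    for (int i = 0; i < locationConstraints.length; i++) {
        String assignedIp = "10.0.0." + locationConstraints[i].substring("nc".length());
        Assert.assertTrue("split " + i + " is not local to " + assignedIp,
                Arrays.asList(fileSplits[i].getLocations()).contains(assignedIp));
    }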

Example 32 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project asterixdb by apache.

the class SchedulerTest method testSchedulerLargerHDFS.

/**
 * Test the case where the HDFS cluster is larger than the Hyracks cluster.
 *
 * @throws Exception
 */
public void testSchedulerLargerHDFS() throws Exception {
    int dataPort = 5099;
    int resultPort = 5098;
    int messagingPort = 5097;
    Map<String, NodeControllerInfo> ncNameToNcInfos = TestUtils.generateNodeControllerInfo(4, "nc", "10.0.0.", dataPort, resultPort, messagingPort);
    ncNameToNcInfos.put("nc7", new NodeControllerInfo("nc7", NodeStatus.ALIVE, new NetworkAddress("10.0.0.7", dataPort), new NetworkAddress("10.0.0.5", resultPort), new NetworkAddress("10.0.0.5", messagingPort), 2));
    ncNameToNcInfos.put("nc12", new NodeControllerInfo("nc12", NodeStatus.ALIVE, new NetworkAddress("10.0.0.12", dataPort), new NetworkAddress("10.0.0.5", resultPort), new NetworkAddress("10.0.0.5", messagingPort), 2));
    InputSplit[] fileSplits = new InputSplit[12];
    fileSplits[0] = new FileSplit(new Path("part-1"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" });
    fileSplits[1] = new FileSplit(new Path("part-2"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" });
    fileSplits[2] = new FileSplit(new Path("part-3"), 0, 0, new String[] { "10.0.0.4", "10.0.0.5", "10.0.0.6" });
    fileSplits[3] = new FileSplit(new Path("part-4"), 0, 0, new String[] { "10.0.0.2", "10.0.0.1", "10.0.0.6" });
    fileSplits[4] = new FileSplit(new Path("part-5"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" });
    fileSplits[5] = new FileSplit(new Path("part-6"), 0, 0, new String[] { "10.0.0.2", "10.0.0.3", "10.0.0.5" });
    fileSplits[6] = new FileSplit(new Path("part-7"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" });
    fileSplits[7] = new FileSplit(new Path("part-8"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" });
    fileSplits[8] = new FileSplit(new Path("part-12"), 0, 0, new String[] { "10.0.0.14", "10.0.0.11", "10.0.0.13" });
    fileSplits[9] = new FileSplit(new Path("part-10"), 0, 0, new String[] { "10.0.0.2", "10.0.0.1", "10.0.0.6" });
    fileSplits[10] = new FileSplit(new Path("part-11"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.7" });
    fileSplits[11] = new FileSplit(new Path("part-9"), 0, 0, new String[] { "10.0.0.4", "10.0.0.5", "10.0.0.6" });
    Scheduler scheduler = new Scheduler(ncNameToNcInfos);
    String[] locationConstraints = scheduler.getLocationConstraints(fileSplits);
    String[] expectedResults = new String[] { "nc1", "nc4", "nc4", "nc1", "nc3", "nc2", "nc2", "nc3", "nc12", "nc7", "nc7", "nc12" };
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(expectedResults[i], locationConstraints[i]);
    }
    expectedResults = new String[] { "nc1", "nc4", "nc4", "nc1", "nc3", "nc2", "nc2", "nc3", "nc7", "nc12", "nc7", "nc12" };
    ClusterTopology topology = parseTopology();
    scheduler = new Scheduler(ncNameToNcInfos, topology);
    locationConstraints = scheduler.getLocationConstraints(fileSplits);
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(expectedResults[i], locationConstraints[i]);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) NetworkAddress(org.apache.hyracks.api.comm.NetworkAddress) NodeControllerInfo(org.apache.hyracks.api.client.NodeControllerInfo) FileSplit(org.apache.hadoop.mapred.FileSplit) ClusterTopology(org.apache.hyracks.api.topology.ClusterTopology) InputSplit(org.apache.hadoop.mapred.InputSplit)

Example 33 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project asterixdb by apache.

the class SchedulerTest method testSchedulerSmallerHDFSOdd.

/**
 * Test the case where the HDFS cluster is smaller than the Hyracks cluster,
 * with an odd number of splits.
 *
 * @throws Exception
 */
public void testSchedulerSmallerHDFSOdd() throws Exception {
    Map<String, NodeControllerInfo> ncNameToNcInfos = TestUtils.generateNodeControllerInfo(6, "nc", "10.0.0.", 5099, 5098, 5097);
    InputSplit[] fileSplits = new InputSplit[13];
    fileSplits[0] = new FileSplit(new Path("part-1"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" });
    fileSplits[1] = new FileSplit(new Path("part-2"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" });
    fileSplits[2] = new FileSplit(new Path("part-3"), 0, 0, new String[] { "10.0.0.4", "10.0.0.5", "10.0.0.3" });
    fileSplits[3] = new FileSplit(new Path("part-4"), 0, 0, new String[] { "10.0.0.2", "10.0.0.1", "10.0.0.3" });
    fileSplits[4] = new FileSplit(new Path("part-5"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" });
    fileSplits[5] = new FileSplit(new Path("part-6"), 0, 0, new String[] { "10.0.0.2", "10.0.0.3", "10.0.0.5" });
    fileSplits[6] = new FileSplit(new Path("part-7"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" });
    fileSplits[7] = new FileSplit(new Path("part-8"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" });
    fileSplits[8] = new FileSplit(new Path("part-9"), 0, 0, new String[] { "10.0.0.4", "10.0.0.5", "10.0.0.1" });
    fileSplits[9] = new FileSplit(new Path("part-10"), 0, 0, new String[] { "10.0.0.2", "10.0.0.1", "10.0.0.2" });
    fileSplits[10] = new FileSplit(new Path("part-11"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" });
    fileSplits[11] = new FileSplit(new Path("part-12"), 0, 0, new String[] { "10.0.0.2", "10.0.0.3", "10.0.0.5" });
    fileSplits[12] = new FileSplit(new Path("part-13"), 0, 0, new String[] { "10.0.0.2", "10.0.0.4", "10.0.0.5" });
    String[] expectedResults = new String[] { "nc1", "nc4", "nc4", "nc1", "nc3", "nc2", "nc2", "nc3", "nc5", "nc1", "nc5", "nc2", "nc4" };
    Scheduler scheduler = new Scheduler(ncNameToNcInfos);
    String[] locationConstraints = scheduler.getLocationConstraints(fileSplits);
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(expectedResults[i], locationConstraints[i]);
    }
    ClusterTopology topology = parseTopology();
    scheduler = new Scheduler(ncNameToNcInfos, topology);
    locationConstraints = scheduler.getLocationConstraints(fileSplits);
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(expectedResults[i], locationConstraints[i]);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) NodeControllerInfo(org.apache.hyracks.api.client.NodeControllerInfo) FileSplit(org.apache.hadoop.mapred.FileSplit) ClusterTopology(org.apache.hyracks.api.topology.ClusterTopology) InputSplit(org.apache.hadoop.mapred.InputSplit)

Example 34 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project asterixdb by apache.

the class HDFSReadOperatorDescriptor method createPushRuntime.

@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx, IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions) throws HyracksDataException {
    final InputSplit[] inputSplits = splitsFactory.getSplits();
    return new AbstractUnaryOutputSourceOperatorNodePushable() {

        private String nodeName = ctx.getJobletContext().getServiceContext().getNodeId();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                writer.open();
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                JobConf conf = confFactory.getConf();
                conf.setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                try {
                    parser.open(writer);
                    InputFormat inputFormat = conf.getInputFormat();
                    for (int i = 0; i < inputSplits.length; i++) {
                        // read all the partitions scheduled to the current node
                        if (scheduledLocations[i].equals(nodeName)) {
                            // pick an unread split to read; synchronize among
                            // simultaneous partitions on the same machine
                            synchronized (executed) {
                                if (executed[i]) {
                                    continue;
                                }
                                executed[i] = true;
                            }
                            // read the split
                            RecordReader reader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);
                            try {
                                Object key = reader.createKey();
                                Object value = reader.createValue();
                                while (reader.next(key, value)) {
                                    parser.parse(key, value, writer, inputSplits[i].toString());
                                }
                            } finally {
                                // close the reader once the split is fully consumed
                                reader.close();
                            }
                        }
                    }
                } finally {
                    parser.close(writer);
                }
            } catch (Throwable th) {
                writer.fail();
                throw new HyracksDataException(th);
            } finally {
                writer.close();
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
Also used : AbstractUnaryOutputSourceOperatorNodePushable(org.apache.hyracks.dataflow.std.base.AbstractUnaryOutputSourceOperatorNodePushable) RecordReader(org.apache.hadoop.mapred.RecordReader) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException) IKeyValueParser(org.apache.hyracks.hdfs.api.IKeyValueParser) InputFormat(org.apache.hadoop.mapred.InputFormat) InputSplit(org.apache.hadoop.mapred.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf)
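
The synchronized block over the shared executed array is what lets several partitions of this operator run on one machine without duplicating work: each partition scans the whole split list and atomically claims the unread splits scheduled to its node. A self-contained sketch of that claim pattern, with hypothetical names standing in for the operator's state:

import java.util.Arrays;
import java.util.List;

// Illustrative reduction of the claim pattern used in initialize() above:
// two "partitions" (threads) share one boolean[] guard, so each split is
// processed exactly once even though both threads scan the full list.
public final class SplitClaimSketch {
    public static void main(String[] args) throws InterruptedException {
        List<String> splits = Arrays.asList("part-1", "part-2", "part-3");
        boolean[] claimed = new boolean[splits.size()];
        Runnable partition = () -> {
            for (int i = 0; i < splits.size(); i++) {
                synchronized (claimed) {
                    if (claimed[i]) {
                        continue; // another partition already claimed this split
                    }
                    claimed[i] = true;
                }
                // the actual record reading happens outside the lock
                System.out.println(Thread.currentThread().getName() + " reads " + splits.get(i));
            }
        };
        Thread p0 = new Thread(partition, "partition-0");
        Thread p1 = new Thread(partition, "partition-1");
        p0.start();
        p1.start();
        p0.join();
        p1.join();
    }
}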

Example 35 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project asterixdb by apache.

the class RackAwareNcCollectionBuilder method build.

@Override
public INcCollection build(Map<String, NodeControllerInfo> ncNameToNcInfos, final Map<String, List<String>> ipToNcMapping, final Map<String, Integer> ncNameToIndex, String[] NCs, final int[] workloads, final int slotLimit) {
    try {
        final Map<List<Integer>, List<String>> pathToNCs = new HashMap<List<Integer>, List<String>>();
        for (int i = 0; i < NCs.length; i++) {
            List<Integer> path = new ArrayList<Integer>();
            String ipAddress = InetAddress.getByAddress(ncNameToNcInfos.get(NCs[i]).getNetworkAddress().lookupIpAddress()).getHostAddress();
            topology.lookupNetworkTerminal(ipAddress, path);
            if (path.isEmpty()) {
                // if the hyracks nc is not in the defined cluster
                path.add(Integer.MIN_VALUE);
                LOGGER.info(NCs[i] + "'s IP address is not in the cluster topology file!");
            }
            List<String> ncs = pathToNCs.get(path);
            if (ncs == null) {
                ncs = new ArrayList<String>();
                pathToNCs.put(path, ncs);
            }
            ncs.add(NCs[i]);
        }
        final TreeMap<List<Integer>, IntWritable> availableIpsToSlots = new TreeMap<List<Integer>, IntWritable>(new Comparator<List<Integer>>() {

            @Override
            public int compare(List<Integer> l1, List<Integer> l2) {
                int commonLength = Math.min(l1.size(), l2.size());
                for (int i = 0; i < commonLength; i++) {
                    int cmp = Integer.compare(l1.get(i), l2.get(i));
                    if (cmp != 0) {
                        return cmp;
                    }
                }
                return Integer.compare(l1.size(), l2.size());
            }
        });
        for (int i = 0; i < workloads.length; i++) {
            if (workloads[i] < slotLimit) {
                List<Integer> path = new ArrayList<Integer>();
                String ipAddress = InetAddress.getByAddress(ncNameToNcInfos.get(NCs[i]).getNetworkAddress().lookupIpAddress()).getHostAddress();
                topology.lookupNetworkTerminal(ipAddress, path);
                if (path.isEmpty()) {
                    // if the hyracks nc is not in the defined cluster
                    path.add(Integer.MIN_VALUE);
                }
                IntWritable availableSlot = availableIpsToSlots.get(path);
                if (availableSlot == null) {
                    availableSlot = new IntWritable(slotLimit - workloads[i]);
                    availableIpsToSlots.put(path, availableSlot);
                } else {
                    availableSlot.set(slotLimit - workloads[i] + availableSlot.get());
                }
            }
        }
        return new INcCollection() {

            @Override
            public String findNearestAvailableSlot(InputSplit split) {
                try {
                    String[] locs = split.getLocations();
                    int minDistance = Integer.MAX_VALUE;
                    List<Integer> currentCandidatePath = null;
                    if (locs != null && locs.length > 0) {
                        for (int j = 0; j < locs.length; j++) {
                            // get all the IP addresses from the name
                            InetAddress[] allIps = InetAddress.getAllByName(locs[j]);
                            boolean inTopology = false;
                            for (InetAddress ip : allIps) {
                                List<Integer> splitPath = new ArrayList<Integer>();
                                boolean inCluster = topology.lookupNetworkTerminal(ip.getHostAddress(), splitPath);
                                if (!inCluster) {
                                    continue;
                                }
                                inTopology = true;
                                // the node controller exists in the topology
                                List<Integer> candidatePath = availableIpsToSlots.floorKey(splitPath);
                                if (candidatePath == null) {
                                    candidatePath = availableIpsToSlots.ceilingKey(splitPath);
                                }
                                if (candidatePath != null && availableIpsToSlots.get(candidatePath).get() > 0) {
                                    int distance = distance(splitPath, candidatePath);
                                    if (minDistance > distance) {
                                        minDistance = distance;
                                        currentCandidatePath = candidatePath;
                                    }
                                }
                            }
                            if (!inTopology) {
                                LOGGER.info(locs[j] + "'s IP address is not in the cluster topology file!");
                                // the machine is not in the topology file:
                                // fall back to the first path that still has an available slot
                                List<Integer> candidatePath = null;
                                for (Entry<List<Integer>, IntWritable> entry : availableIpsToSlots.entrySet()) {
                                    if (entry.getValue().get() > 0) {
                                        candidatePath = entry.getKey();
                                        break;
                                    }
                                }
                                // the split path is empty; take the fallback candidate if it still has slots
                                if (candidatePath != null && availableIpsToSlots.get(candidatePath).get() > 0) {
                                    currentCandidatePath = candidatePath;
                                }
                            }
                        }
                    } else {
                        for (Entry<List<Integer>, IntWritable> entry : availableIpsToSlots.entrySet()) {
                            if (entry.getValue().get() > 0) {
                                currentCandidatePath = entry.getKey();
                                break;
                            }
                        }
                    }
                    if (currentCandidatePath != null && currentCandidatePath.size() > 0) {
                        // update the entry of the selected IP
                        IntWritable availableSlot = availableIpsToSlots.get(currentCandidatePath);
                        availableSlot.set(availableSlot.get() - 1);
                        if (availableSlot.get() == 0) {
                            availableIpsToSlots.remove(currentCandidatePath);
                        }
                        // update the entry of the selected NC
                        List<String> candidateNcs = pathToNCs.get(currentCandidatePath);
                        for (String candidate : candidateNcs) {
                            int ncIndex = ncNameToIndex.get(candidate);
                            if (workloads[ncIndex] < slotLimit) {
                                return candidate;
                            }
                        }
                    }
                    // not scheduled
                    return null;
                } catch (Exception e) {
                    throw new IllegalStateException(e);
                }
            }

            @Override
            public int numAvailableSlots() {
                return availableIpsToSlots.size();
            }

            private int distance(List<Integer> splitPath, List<Integer> candidatePath) {
                int commonLength = Math.min(splitPath.size(), candidatePath.size());
                int distance = 0;
                for (int i = 0; i < commonLength; i++) {
                    distance = distance * 100 + Math.abs(splitPath.get(i) - candidatePath.get(i));
                }
                List<Integer> restElements = splitPath.size() > candidatePath.size() ? splitPath : candidatePath;
                for (int i = commonLength; i < restElements.size(); i++) {
                    distance = distance * 100 + Math.abs(restElements.get(i));
                }
                return distance;
            }
        };
    } catch (Exception e) {
        throw new IllegalStateException(e);
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) INcCollection(org.apache.hyracks.hdfs.api.INcCollection) TreeMap(java.util.TreeMap) ArrayList(java.util.ArrayList) List(java.util.List) InputSplit(org.apache.hadoop.mapred.InputSplit) InetAddress(java.net.InetAddress) IntWritable(org.apache.hadoop.io.IntWritable)
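
The private distance() helper compares two topology paths digit by digit in base 100, so a mismatch at a higher level of the topology (an earlier list element) outweighs any number of mismatches below it. A standalone copy with a small worked example; the sample paths are made up:

import java.util.Arrays;
import java.util.List;

// Standalone copy of the distance() helper above. Topology paths are lists
// of switch indices from the root down; the base-100 weighting makes
// higher-level differences dominate lower-level ones.
public final class TopologyDistanceSketch {
    static int distance(List<Integer> splitPath, List<Integer> candidatePath) {
        int commonLength = Math.min(splitPath.size(), candidatePath.size());
        int distance = 0;
        for (int i = 0; i < commonLength; i++) {
            distance = distance * 100 + Math.abs(splitPath.get(i) - candidatePath.get(i));
        }
        List<Integer> restElements = splitPath.size() > candidatePath.size() ? splitPath : candidatePath;
        for (int i = commonLength; i < restElements.size(); i++) {
            distance = distance * 100 + Math.abs(restElements.get(i));
        }
        return distance;
    }

    public static void main(String[] args) {
        // same rack, neighboring hosts: only the last digit differs -> prints 1
        System.out.println(distance(Arrays.asList(0, 1, 3), Arrays.asList(0, 1, 4)));
        // different rack: the rack-level digit is weighted by 100 -> prints 100
        System.out.println(distance(Arrays.asList(0, 1, 3), Arrays.asList(0, 2, 3)));
    }
}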

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit): 161 uses
Path (org.apache.hadoop.fs.Path): 57 uses
JobConf (org.apache.hadoop.mapred.JobConf): 56 uses
Test (org.junit.Test): 49 uses
IOException (java.io.IOException): 47 uses
ArrayList (java.util.ArrayList): 29 uses
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 27 uses
FileSplit (org.apache.hadoop.mapred.FileSplit): 24 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 21 uses
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 21 uses
InputFormat (org.apache.hadoop.mapred.InputFormat): 19 uses
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 19 uses
NullWritable (org.apache.hadoop.io.NullWritable): 18 uses
Text (org.apache.hadoop.io.Text): 18 uses
Configuration (org.apache.hadoop.conf.Configuration): 14 uses
LongWritable (org.apache.hadoop.io.LongWritable): 11 uses
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 10 uses
Properties (java.util.Properties): 9 uses
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint): 9 uses
HashMap (java.util.HashMap): 8 uses