Search in sources :

Example 1 with LocatableInputSplit

use of org.apache.flink.core.io.LocatableInputSplit in project flink by apache.

the class ExecutionJobVertex method computeLocalInputSplitsPerTask.

// --------------------------------------------------------------------------------------------
//  Static / pre-assigned input splits
// --------------------------------------------------------------------------------------------
/**
 * Computes a strictly local, pre-assigned distribution of input splits over the parallel subtasks.
 *
 * <p>Every split must be a {@link LocatableInputSplit} carrying exactly one non-null host name.
 * The splits are grouped by that host, the hosts are ordered by natural String order, and the
 * subtasks are divided among the hosts as evenly as possible (the first hosts in that order
 * receive one extra subtask when the division leaves a remainder). The splits of one host are
 * then divided as evenly as possible among the subtasks used for that host.
 *
 * <p>A host never uses more subtasks than it has splits. Array slots are filled consecutively
 * from index 0, so when some host has fewer splits than subtasks allotted to it, the trailing
 * slots of the returned array stay {@code null}.
 *
 * @param splits the input splits to distribute
 * @return an array with one slot per parallel subtask, holding the splits assigned to it
 * @throws JobException if there are fewer splits than subtasks, if a split is not a
 *         {@link LocatableInputSplit}, if a split does not have exactly one non-null host name,
 *         or if there are more distinct hosts than subtasks
 */
private List<LocatableInputSplit>[] computeLocalInputSplitsPerTask(InputSplit[] splits) throws JobException {
    final int parallelism = getParallelism();

    // every subtask needs at least one split to be available overall
    if (splits.length < parallelism) {
        throw new JobException("Strictly local assignment requires at least as many splits as subtasks.");
    }

    // bucket the splits by host; each bucket keeps the relative input order of its splits
    final Map<String, List<LocatableInputSplit>> hostToSplits = new HashMap<String, List<LocatableInputSplit>>();
    for (InputSplit candidate : splits) {
        if (!(candidate instanceof LocatableInputSplit)) {
            throw new JobException("Invalid InputSplit type " + candidate.getClass().getCanonicalName() + ". Strictly local assignment requires LocatableInputSplit");
        }
        final LocatableInputSplit locatable = (LocatableInputSplit) candidate;

        // exactly one host name is required per split
        final String[] hostnames = locatable.getHostnames();
        if (hostnames == null) {
            throw new JobException("LocatableInputSplit has no host information. Strictly local assignment requires exactly one hostname for each LocatableInputSplit.");
        }
        if (hostnames.length != 1) {
            throw new JobException("Strictly local assignment requires exactly one hostname for each LocatableInputSplit.");
        }
        final String splitHost = hostnames[0];
        if (splitHost == null) {
            throw new JobException("For strictly local input split assignment, no null host names are allowed.");
        }

        List<LocatableInputSplit> bucket = hostToSplits.get(splitHost);
        if (bucket == null) {
            bucket = new ArrayList<LocatableInputSplit>();
            hostToSplits.put(splitHost, bucket);
        }
        bucket.add(locatable);
    }

    final int hostCount = hostToSplits.size();
    if (hostCount > parallelism) {
        throw new JobException("Strictly local split assignment requires at least as many parallel subtasks as distinct split hosts. Please increase the parallelism of DataSource " + this.getJobVertex().getName() + " to at least " + hostCount + ".");
    }

    // sort the host names so that the assignment does not depend on hash map iteration order
    final List<String> sortedHosts = new ArrayList<String>(hostToSplits.keySet());
    Collections.sort(sortedHosts);

    @SuppressWarnings("unchecked")
    final List<LocatableInputSplit>[] assignment = (List<LocatableInputSplit>[]) new List<?>[parallelism];

    final int baseSubtasksPerHost = parallelism / hostCount;
    final int hostsWithExtraSubtask = parallelism % hostCount;

    int nextSubtask = 0;
    int hostIndex = 0;
    for (String host : sortedHosts) {
        final List<LocatableInputSplit> hostSplits = hostToSplits.get(host);
        final int splitCount = hostSplits.size();

        // subtasks allotted to this host; the first hosts absorb the division remainder
        int allotted = baseSubtasksPerHost;
        if (hostIndex < hostsWithExtraSubtask) {
            allotted++;
        }

        // a host with few splits uses fewer subtasks than allotted
        final int usedSubtasks = Math.min(splitCount, allotted);
        final int baseSplitsPerSubtask = splitCount / usedSubtasks;
        final int subtasksWithExtraSplit = splitCount % usedSubtasks;

        // hand each used subtask a consecutive sub-range of this host's splits
        int offset = 0;
        for (int s = 0; s < usedSubtasks; s++) {
            int take = baseSplitsPerSubtask;
            if (s < subtasksWithExtraSplit) {
                take++;
            }

            final List<LocatableInputSplit> share;
            if (take == splitCount) {
                // a single subtask takes every split of the host: reuse the list without copying
                share = hostSplits;
            } else {
                share = new ArrayList<LocatableInputSplit>(hostSplits.subList(offset, offset + take));
                offset += take;
            }
            assignment[nextSubtask++] = share;
        }
        hostIndex++;
    }
    return assignment;
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) JobException(org.apache.flink.runtime.JobException) LocatableInputSplit(org.apache.flink.core.io.LocatableInputSplit) ArrayList(java.util.ArrayList) List(java.util.List) InputSplit(org.apache.flink.core.io.InputSplit) LocatableInputSplit(org.apache.flink.core.io.LocatableInputSplit)

Example 2 with LocatableInputSplit

use of org.apache.flink.core.io.LocatableInputSplit in project flink by apache.

the class LocatableSplitAssignerTest method testSerialSplitAssignmentSomeForRemoteHost.

/**
 * Serially requests splits for three hosts in round-robin order when only two of the hosts
 * own splits and a further batch of splits lives on a host that never asks. Expects every
 * split to be handed out exactly once, with the splits of the non-requesting host counted as
 * remote assignments and all others as local assignments.
 */
@Test
public void testSerialSplitAssignmentSomeForRemoteHost() {
    try {
        // intended scenario:
        //   host1 reads only local splits
        //   host2 reads 10 local and 10 remote splits
        //   host3 reads only remote splits
        final String[] requestingHosts = { "host1", "host2", "host3" };
        final int host1LocalCount = 20;
        final int host2LocalCount = 10;
        final int remoteCount = 30;
        final int localCount = host1LocalCount + host2LocalCount;

        // build the splits with one running id across all three groups
        final Set<LocatableInputSplit> pending = new HashSet<LocatableInputSplit>();
        int nextId = 0;
        while (nextId < host1LocalCount) {
            pending.add(new LocatableInputSplit(nextId, "host1"));
            nextId++;
        }
        while (nextId < localCount) {
            pending.add(new LocatableInputSplit(nextId, "host2"));
            nextId++;
        }
        while (nextId < localCount + remoteCount) {
            pending.add(new LocatableInputSplit(nextId, "remoteHost"));
            nextId++;
        }

        // drain the assigner, cycling through the requesting hosts
        final LocatableInputSplitAssigner assigner = new LocatableInputSplitAssigner(pending);
        for (int request = 0; ; request++) {
            final InputSplit next = assigner.getNextInputSplit(requestingHosts[request % requestingHosts.length], 0);
            if (next == null) {
                break;
            }
            // each returned split must be one we created and must not have been seen before
            assertTrue(pending.remove(next));
        }

        // every split was handed out and nothing is left for anyone else
        assertTrue(pending.isEmpty());
        assertNull(assigner.getNextInputSplit("anotherHost", 0));
        assertEquals(remoteCount, assigner.getNumberOfRemoteAssignments());
        assertEquals(localCount, assigner.getNumberOfLocalAssignments());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : LocatableInputSplit(org.apache.flink.core.io.LocatableInputSplit) LocatableInputSplitAssigner(org.apache.flink.api.common.io.LocatableInputSplitAssigner) InputSplit(org.apache.flink.core.io.InputSplit) LocatableInputSplit(org.apache.flink.core.io.LocatableInputSplit) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 3 with LocatableInputSplit

use of org.apache.flink.core.io.LocatableInputSplit in project flink by apache.

the class LocatableSplitAssignerTest method testSerialSplitAssignmentWithNullHost.

/**
 * Serially requests splits with a {@code null} requesting host. The splits cycle through three
 * host declarations: a single host name, an empty host array and a {@code null} host array.
 * Expects every split to be handed out exactly once and all of them to be counted as remote
 * assignments.
 */
@Test
public void testSerialSplitAssignmentWithNullHost() {
    try {
        final int splitCount = 50;
        final String[][] hostVariants = new String[][] { new String[] { "localhost" }, new String[0], null };

        // create the splits, rotating through the three host declarations
        final Set<LocatableInputSplit> pending = new HashSet<LocatableInputSplit>();
        for (int id = 0; id < splitCount; id++) {
            final String[] hostnames = hostVariants[id % hostVariants.length];
            pending.add(new LocatableInputSplit(id, hostnames));
        }

        // drain the assigner without naming a requesting host
        final LocatableInputSplitAssigner assigner = new LocatableInputSplitAssigner(pending);
        for (;;) {
            final InputSplit next = assigner.getNextInputSplit(null, 0);
            if (next == null) {
                break;
            }
            // each returned split must be one we created and must not have been seen before
            assertTrue(pending.remove(next));
        }

        // every split was handed out and nothing is left
        assertTrue(pending.isEmpty());
        assertNull(assigner.getNextInputSplit("", 0));
        assertEquals(splitCount, assigner.getNumberOfRemoteAssignments());
        assertEquals(0, assigner.getNumberOfLocalAssignments());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : LocatableInputSplit(org.apache.flink.core.io.LocatableInputSplit) LocatableInputSplitAssigner(org.apache.flink.api.common.io.LocatableInputSplitAssigner) InputSplit(org.apache.flink.core.io.InputSplit) LocatableInputSplit(org.apache.flink.core.io.LocatableInputSplit) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 4 with LocatableInputSplit

use of org.apache.flink.core.io.LocatableInputSplit in project flink by apache.

the class LocatableSplitAssignerTest method testConcurrentSplitAssignmentForMultipleHosts.

/**
 * Lets several threads pull splits concurrently, each thread requesting for one randomly chosen
 * host out of a list in which some hosts appear more often than others. Expects every split to
 * be retrieved exactly once (checked via the retrieval count and the sum of the split numbers)
 * and a minimum number of local assignments.
 */
@Test
public void testConcurrentSplitAssignmentForMultipleHosts() {
    try {
        final int threadCount = 10;
        final int splitCount = 500;
        // sum of the split numbers 0 .. splitCount - 1
        final int expectedIdSum = (splitCount - 1) * (splitCount) / 2;
        final String[] hostPool = { "host1", "host1", "host1", "host2", "host2", "host3" };

        // create the splits, rotating through the host pool
        final Set<LocatableInputSplit> allSplits = new HashSet<LocatableInputSplit>();
        for (int id = 0; id < splitCount; id++) {
            allSplits.add(new LocatableInputSplit(id, hostPool[id % hostPool.length]));
        }

        final LocatableInputSplitAssigner assigner = new LocatableInputSplitAssigner(allSplits);
        final AtomicInteger retrievedCount = new AtomicInteger(0);
        final AtomicInteger retrievedIdSum = new AtomicInteger(0);

        // each worker picks one random host and pulls splits until the assigner runs dry
        final Runnable worker = new Runnable() {

            @Override
            public void run() {
                final String ownHost = hostPool[(int) (Math.random() * hostPool.length)];
                for (;;) {
                    final LocatableInputSplit taken = assigner.getNextInputSplit(ownHost, 0);
                    if (taken == null) {
                        return;
                    }
                    retrievedCount.incrementAndGet();
                    retrievedIdSum.addAndGet(taken.getSplitNumber());
                }
            }
        };

        // create all threads before starting any of them
        final Thread[] workers = new Thread[threadCount];
        for (int t = 0; t < threadCount; t++) {
            final Thread thread = new Thread(worker);
            thread.setDaemon(true);
            workers[t] = thread;
        }

        // launch concurrently
        for (Thread thread : workers) {
            thread.start();
        }

        // wait for each thread, but at most five seconds per thread
        for (Thread thread : workers) {
            thread.join(5000);
        }

        // a thread that is still running at this point did not finish in time
        for (Thread thread : workers) {
            if (thread.isAlive()) {
                fail("The concurrency test case is erroneous, the thread did not respond in time.");
            }
        }

        assertEquals(splitCount, retrievedCount.get());
        assertEquals(expectedIdSum, retrievedIdSum.get());

        // nothing left
        assertNull(assigner.getNextInputSplit("testhost", 0));

        // at least one fraction of hosts needs be local, no matter how bad the thread races
        assertTrue(assigner.getNumberOfLocalAssignments() >= splitCount / hostPool.length);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : AtomicInteger(java.util.concurrent.atomic.AtomicInteger) LocatableInputSplit(org.apache.flink.core.io.LocatableInputSplit) LocatableInputSplitAssigner(org.apache.flink.api.common.io.LocatableInputSplitAssigner) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 5 with LocatableInputSplit

use of org.apache.flink.core.io.LocatableInputSplit in project flink by apache.

the class LocatableSplitAssignerTest method testSerialSplitAssignmentMixedLocalHost.

/**
 * Serially requests splits in round-robin order over the same host list that was used to place
 * the splits. Expects every split to be handed out exactly once and all of them to be counted
 * as local assignments.
 */
@Test
public void testSerialSplitAssignmentMixedLocalHost() {
    try {
        final String[] hostPool = { "host1", "host1", "host1", "host2", "host2", "host3" };
        final int splitCount = 10 * hostPool.length;

        // create the splits, rotating through the host pool
        final Set<LocatableInputSplit> pending = new HashSet<LocatableInputSplit>();
        for (int id = 0; id < splitCount; id++) {
            pending.add(new LocatableInputSplit(id, hostPool[id % hostPool.length]));
        }

        // drain the assigner, cycling through the host pool for the requesting host
        final LocatableInputSplitAssigner assigner = new LocatableInputSplitAssigner(pending);
        for (int request = 0; ; request++) {
            final InputSplit next = assigner.getNextInputSplit(hostPool[request % hostPool.length], 0);
            if (next == null) {
                break;
            }
            // each returned split must be one we created and must not have been seen before
            assertTrue(pending.remove(next));
        }

        // every split was handed out and nothing is left for anyone else
        assertTrue(pending.isEmpty());
        assertNull(assigner.getNextInputSplit("anotherHost", 0));
        assertEquals(0, assigner.getNumberOfRemoteAssignments());
        assertEquals(splitCount, assigner.getNumberOfLocalAssignments());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : LocatableInputSplit(org.apache.flink.core.io.LocatableInputSplit) LocatableInputSplitAssigner(org.apache.flink.api.common.io.LocatableInputSplitAssigner) InputSplit(org.apache.flink.core.io.InputSplit) LocatableInputSplit(org.apache.flink.core.io.LocatableInputSplit) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

LocatableInputSplit (org.apache.flink.core.io.LocatableInputSplit)11 HashSet (java.util.HashSet)10 LocatableInputSplitAssigner (org.apache.flink.api.common.io.LocatableInputSplitAssigner)10 Test (org.junit.Test)10 InputSplit (org.apache.flink.core.io.InputSplit)6 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)3 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 List (java.util.List)1 Random (java.util.Random)1 JobException (org.apache.flink.runtime.JobException)1