Search in sources :

Example 11 with IOManager

use of org.apache.flink.runtime.io.disk.iomanager.IOManager in project flink by apache.

the class HashTableITCase method testSpillingHashJoinWithMassiveCollisionsIntPair.

/**
 * Runs a hash join over {@link IntPair} inputs in which two keys that collide on the initial
 * hash level are heavily over-represented on the build side, and verifies the number of
 * build-side matches accumulated per probe key.
 *
 * <p>Every regular key is expected to yield {@code PROBE_VALS_PER_KEY * BUILD_VALS_PER_KEY}
 * matches; the two repeated keys additionally carry the extra probe and build records created
 * by the constant iterators.
 *
 * @throws IOException propagated from the hash table operations
 */
@Test
public void testSpillingHashJoinWithMassiveCollisionsIntPair() throws IOException {
    // the following two values are known to have a hash-code collision on the initial level.
    // we use them to make sure one partition grows over-proportionally large
    final int REPEATED_VALUE_1 = 40559;
    final int REPEATED_VALUE_2 = 92882;
    final int REPEATED_VALUE_COUNT_BUILD = 200000;
    final int REPEATED_VALUE_COUNT_PROBE = 5;
    final int NUM_KEYS = 1000000;
    final int BUILD_VALS_PER_KEY = 3;
    final int PROBE_VALS_PER_KEY = 10;
    // create a build input that gives 3 million pairs with 3 values sharing the same key, plus 400k pairs with two colliding keys
    MutableObjectIterator<IntPair> build1 = new UniformIntPairGenerator(NUM_KEYS, BUILD_VALS_PER_KEY, false);
    MutableObjectIterator<IntPair> build2 = new ConstantsIntPairsIterator(REPEATED_VALUE_1, 17, REPEATED_VALUE_COUNT_BUILD);
    MutableObjectIterator<IntPair> build3 = new ConstantsIntPairsIterator(REPEATED_VALUE_2, 23, REPEATED_VALUE_COUNT_BUILD);
    List<MutableObjectIterator<IntPair>> builds = new ArrayList<MutableObjectIterator<IntPair>>();
    builds.add(build1);
    builds.add(build2);
    builds.add(build3);
    MutableObjectIterator<IntPair> buildInput = new UnionIterator<IntPair>(builds);
    // create a probe input that gives 10 million pairs with 10 values sharing a key, plus a few
    // extra probe records for each of the two colliding keys. The count must be the same constant
    // that the final assertion uses, otherwise the expected cross product silently goes stale.
    MutableObjectIterator<IntPair> probe1 = new UniformIntPairGenerator(NUM_KEYS, PROBE_VALS_PER_KEY, true);
    MutableObjectIterator<IntPair> probe2 = new ConstantsIntPairsIterator(REPEATED_VALUE_1, 17, REPEATED_VALUE_COUNT_PROBE);
    MutableObjectIterator<IntPair> probe3 = new ConstantsIntPairsIterator(REPEATED_VALUE_2, 23, REPEATED_VALUE_COUNT_PROBE);
    List<MutableObjectIterator<IntPair>> probes = new ArrayList<MutableObjectIterator<IntPair>>();
    probes.add(probe1);
    probes.add(probe2);
    probes.add(probe3);
    MutableObjectIterator<IntPair> probeInput = new UnionIterator<IntPair>(probes);
    // allocate the memory for the HashTable
    List<MemorySegment> memSegments;
    try {
        memSegments = this.memManager.allocatePages(MEM_OWNER, 896);
    } catch (MemoryAllocationException maex) {
        fail("Memory for the Join could not be provided.");
        return;
    }
    // create the I/O access for spilling
    // NOTE(review): this I/O manager is not shut down anywhere in this method - confirm whether
    // the surrounding test class is expected to clean it up.
    IOManager ioManager = new IOManagerAsync();
    // create the map for validating the results (key -> total number of build-side matches)
    HashMap<Integer, Long> map = new HashMap<Integer, Long>(NUM_KEYS);
    // ----------------------------------------------------------------------------------------
    final MutableHashTable<IntPair, IntPair> join = new MutableHashTable<IntPair, IntPair>(this.pairBuildSideAccesssor, this.pairProbeSideAccesssor, this.pairBuildSideComparator, this.pairProbeSideComparator, this.pairComparator, memSegments, ioManager);
    join.open(buildInput, probeInput);
    IntPair record;
    final IntPair recordReuse = new IntPair();
    while (join.nextRecord()) {
        int numBuildValues = 0;
        final IntPair probeRec = join.getCurrentProbeRecord();
        int key = probeRec.getKey();
        MutableObjectIterator<IntPair> buildSide = join.getBuildSideIterator();
        // nextRecord() returned true, so at least one build-side match is expected
        if ((record = buildSide.next(recordReuse)) != null) {
            numBuildValues = 1;
            Assert.assertEquals("Probe-side key was different than build-side key.", key, record.getKey());
        } else {
            fail("No build side values found for a probe key.");
        }
        while ((record = buildSide.next(recordReuse)) != null) {
            numBuildValues++;
            Assert.assertEquals("Probe-side key was different than build-side key.", key, record.getKey());
        }
        // accumulate the matches over all probe records that share this key
        Long contained = map.get(key);
        if (contained == null) {
            contained = Long.valueOf(numBuildValues);
        } else {
            contained = Long.valueOf(contained.longValue() + numBuildValues);
        }
        map.put(key, contained);
    }
    join.close();
    Assert.assertEquals("Wrong number of keys", NUM_KEYS, map.size());
    // expected per-key cross product sizes; computed once, and in long arithmetic so that larger
    // constants cannot overflow an int
    final long expectedRegular = (long) PROBE_VALS_PER_KEY * BUILD_VALS_PER_KEY;
    final long expectedRepeated = (long) (PROBE_VALS_PER_KEY + REPEATED_VALUE_COUNT_PROBE) * (BUILD_VALS_PER_KEY + REPEATED_VALUE_COUNT_BUILD);
    for (Map.Entry<Integer, Long> entry : map.entrySet()) {
        long val = entry.getValue();
        int key = entry.getKey();
        final boolean repeatedKey = key == REPEATED_VALUE_1 || key == REPEATED_VALUE_2;
        Assert.assertEquals("Wrong number of values in per-key cross product for key " + key, repeatedKey ? expectedRepeated : expectedRegular, val);
    }
    // ----------------------------------------------------------------------------------------
    this.memManager.release(join.getFreedMemory());
}
Also used : MutableObjectIterator(org.apache.flink.util.MutableObjectIterator) UnionIterator(org.apache.flink.runtime.operators.testutils.UnionIterator) MemoryAllocationException(org.apache.flink.runtime.memory.MemoryAllocationException) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) IntPair(org.apache.flink.runtime.operators.testutils.types.IntPair) MemorySegment(org.apache.flink.core.memory.MemorySegment) IOManagerAsync(org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync) HashMap(java.util.HashMap) Map(java.util.Map) UniformIntPairGenerator(org.apache.flink.runtime.operators.testutils.UniformIntPairGenerator) Test(org.junit.Test)

Example 12 with IOManager

use of org.apache.flink.runtime.io.disk.iomanager.IOManager in project flink by apache.

the class BatchTask method initLocalStrategies.

/**
 * Initializes the local strategy of every input and then, for each input that the task
 * configuration marks as asynchronously materialized or cached, wraps it in a
 * {@code TempBarrier} or a {@code SpillingResettableMutableObjectIterator} respectively.
 *
 * <p>NOTE: This method must be invoked after the invocation of {@code #initInputReaders()} and
 * {@code #initInputSerializersAndComparators(int)}!
 *
 * @param numInputs the number of inputs to set up
 * @throws Exception if an input is marked as materialized/cached but the number of pages
 *     computed for its materialization is not positive, or if one of the invoked set-up
 *     steps fails
 */
protected void initLocalStrategies(int numInputs) throws Exception {
    final MemoryManager memoryMgr = getMemoryManager();
    final IOManager ioMgr = getIOManager();

    // per-input bookkeeping, all sized to the number of inputs
    this.localStrategies = new CloseableInputProvider<?>[numInputs];
    this.inputs = new MutableObjectIterator<?>[numInputs];
    this.excludeFromReset = new boolean[numInputs];
    this.inputIsCached = new boolean[numInputs];
    this.inputIsAsyncMaterialized = new boolean[numInputs];
    this.materializationMemory = new int[numInputs];

    // set up the local strategies first, such that they can work before any temp barrier is created
    for (int inputIdx = 0; inputIdx < numInputs; inputIdx++) {
        initInputLocalStrategy(inputIdx);
    }

    // A second pass over the inputs is used, because all sorters etc. should be instantiated
    // before the first input is requested (that call may block).
    // There are two kinds of materialized inputs, both replayable (usable as a cache):
    //  - the async variant materializes in a different thread and hence acts as a pipeline
    //    breaker; it should only be present if a pipeline breaker is needed.
    //  - the cached variant spills to the side and does not read unless the result is also
    //    consumed in a pipelined fashion.
    this.resettableInputs = new SpillingResettableMutableObjectIterator<?>[numInputs];
    this.tempBarriers = new TempBarrier<?>[numInputs];

    for (int inputIdx = 0; inputIdx < numInputs; inputIdx++) {
        final boolean materializeAsync = this.config.isInputAsynchronouslyMaterialized(inputIdx);
        final boolean cacheInput = this.config.isInputCached(inputIdx);
        this.inputIsAsyncMaterialized[inputIdx] = materializeAsync;
        this.inputIsCached[inputIdx] = cacheInput;

        // pages stay at zero for inputs that are neither materialized nor cached
        int pages = 0;
        if (materializeAsync || cacheInput) {
            pages = memoryMgr.computeNumberOfPages(this.config.getRelativeInputMaterializationMemory(inputIdx));
            if (pages <= 0) {
                throw new Exception("Input marked as materialized/cached, but no memory for materialization provided.");
            }
            this.materializationMemory[inputIdx] = pages;
        }

        if (materializeAsync) {
            // async materialization takes precedence over plain caching
            @SuppressWarnings({ "unchecked", "rawtypes" }) TempBarrier<?> tempBarrier = new TempBarrier(this, getInput(inputIdx), this.inputSerializers[inputIdx], memoryMgr, ioMgr, pages);
            tempBarrier.startReading();
            this.tempBarriers[inputIdx] = tempBarrier;
            this.inputs[inputIdx] = null;
        } else if (cacheInput) {
            @SuppressWarnings({ "unchecked", "rawtypes" }) SpillingResettableMutableObjectIterator<?> resettable = new SpillingResettableMutableObjectIterator(getInput(inputIdx), this.inputSerializers[inputIdx].getSerializer(), getMemoryManager(), getIOManager(), pages, this);
            this.resettableInputs[inputIdx] = resettable;
            this.inputs[inputIdx] = resettable;
        }
    }
}
Also used : IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) SpillingResettableMutableObjectIterator(org.apache.flink.runtime.operators.resettable.SpillingResettableMutableObjectIterator) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) ExceptionInChainedStubException(org.apache.flink.runtime.operators.chaining.ExceptionInChainedStubException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) IOException(java.io.IOException)

Example 13 with IOManager

use of org.apache.flink.runtime.io.disk.iomanager.IOManager in project flink by apache.

the class BatchTask method resetAllInputs.

/**
 * Resets all inputs of this task: cached inputs consume and cache their remaining data, all
 * local strategies are closed, and then each input is either released (when excluded from
 * the reset), re-served from its cache, or has its local strategy re-created.
 *
 * @throws Exception if closing, resetting or re-initializing one of the inputs fails
 */
protected void resetAllInputs() throws Exception {
    // Let the resettable caches consume their remaining data.
    // NOTE: we need to do this before closing the local strategies
    for (int idx = 0; idx < this.inputs.length; idx++) {
        if (this.inputIsCached[idx] && this.resettableInputs[idx] != null) {
            this.resettableInputs[idx].consumeAndCacheRemainingData();
        }
    }

    // close every local strategy that is still present and drop the reference to it
    for (int idx = 0; idx < this.localStrategies.length; idx++) {
        if (this.localStrategies[idx] == null) {
            continue;
        }
        this.localStrategies[idx].close();
        this.localStrategies[idx] = null;
    }

    final MemoryManager memoryMgr = getMemoryManager();
    final IOManager ioMgr = getIOManager();

    // reset the caches, or re-run the input local strategy
    for (int idx = 0; idx < this.inputs.length; idx++) {
        if (this.excludeFromReset[idx]) {
            // excluded inputs only release their temp barrier or resettable iterator
            if (this.tempBarriers[idx] != null) {
                this.tempBarriers[idx].close();
                this.tempBarriers[idx] = null;
            } else if (this.resettableInputs[idx] != null) {
                this.resettableInputs[idx].close();
                this.resettableInputs[idx] = null;
            }
            continue;
        }

        // make sure the input is not available directly, but is lazily fetched again
        this.inputs[idx] = null;

        if (this.inputIsCached[idx]) {
            // serve the input again from whichever cache holds it
            if (this.tempBarriers[idx] != null) {
                this.inputs[idx] = this.tempBarriers[idx].getIterator();
            } else if (this.resettableInputs[idx] != null) {
                this.resettableInputs[idx].reset();
                this.inputs[idx] = this.resettableInputs[idx];
            } else {
                throw new RuntimeException("Found a resettable input, but no temp barrier and no resettable iterator.");
            }
            continue;
        }

        // not cached: close the async barrier if there is one
        if (this.tempBarriers[idx] != null) {
            this.tempBarriers[idx].close();
        }

        // recreate the local strategy
        initInputLocalStrategy(idx);

        if (this.inputIsAsyncMaterialized[idx]) {
            // start a fresh barrier with the page count recorded for this input
            final int barrierPages = this.materializationMemory[idx];
            @SuppressWarnings({ "unchecked", "rawtypes" }) TempBarrier<?> freshBarrier = new TempBarrier(this, getInput(idx), this.inputSerializers[idx], memoryMgr, ioMgr, barrierPages);
            freshBarrier.startReading();
            this.tempBarriers[idx] = freshBarrier;
            this.inputs[idx] = null;
        }
    }
}
Also used : IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) MemoryManager(org.apache.flink.runtime.memory.MemoryManager)

Example 14 with IOManager

use of org.apache.flink.runtime.io.disk.iomanager.IOManager in project flink by apache.

the class AbstractOuterJoinDriver method prepare.

/**
 * Prepares the outer join: gathers the managers, inputs, serializers and comparators from the
 * task context, creates the reusing or non-reusing outer join iterator depending on the
 * execution config's object-reuse setting, and opens it.
 *
 * @throws Exception if no pair comparator factory is configured, or if creating or opening
 *     the iterator fails
 */
@Override
public void prepare() throws Exception {
    final TaskConfig taskConfig = this.taskContext.getTaskConfig();

    // obtain task manager's memory manager and I/O manager
    final MemoryManager memMgr = this.taskContext.getMemoryManager();
    final IOManager ioMgr = this.taskContext.getIOManager();

    // set up memory and I/O parameters
    final double memoryFraction = taskConfig.getRelativeMemoryDriver();
    final DriverStrategy strategy = taskConfig.getDriverStrategy();

    // both inputs report into the same records-in counter
    final Counter recordsInCounter = this.taskContext.getMetricGroup().getIOMetricGroup().getNumRecordsInCounter();
    final MutableObjectIterator<IT1> firstInput = new CountingMutableObjectIterator<>(this.taskContext.<IT1>getInput(0), recordsInCounter);
    final MutableObjectIterator<IT2> secondInput = new CountingMutableObjectIterator<>(this.taskContext.<IT2>getInput(1), recordsInCounter);

    // get serializers and comparators
    final TypeSerializer<IT1> firstSerializer = this.taskContext.<IT1>getInputSerializer(0).getSerializer();
    final TypeSerializer<IT2> secondSerializer = this.taskContext.<IT2>getInputSerializer(1).getSerializer();
    final TypeComparator<IT1> firstComparator = this.taskContext.getDriverComparator(0);
    final TypeComparator<IT2> secondComparator = this.taskContext.getDriverComparator(1);

    final TypePairComparatorFactory<IT1, IT2> pairFactory = taskConfig.getPairComparatorFactory(this.taskContext.getUserCodeClassLoader());
    if (pairFactory == null) {
        throw new Exception("Missing pair comparator factory for outer join driver");
    }

    final boolean reuseObjects = this.taskContext.getExecutionConfig().isObjectReuseEnabled();
    if (LOG.isDebugEnabled()) {
        final String reuseState = reuseObjects ? "ENABLED" : "DISABLED";
        LOG.debug("Outer Join Driver object reuse: " + reuseState + ".");
    }

    // create the outer join iterator according to the provided local strategy
    if (!reuseObjects) {
        this.outerJoinIterator = getNonReusingOuterJoinIterator(strategy, firstInput, secondInput, firstSerializer, firstComparator, secondSerializer, secondComparator, pairFactory, memMgr, ioMgr, memoryFraction);
    } else {
        this.outerJoinIterator = getReusingOuterJoinIterator(strategy, firstInput, secondInput, firstSerializer, firstComparator, secondSerializer, secondComparator, pairFactory, memMgr, ioMgr, memoryFraction);
    }

    this.outerJoinIterator.open();

    if (LOG.isDebugEnabled()) {
        LOG.debug(this.taskContext.formatLogString("outer join task iterator ready."));
    }
}
Also used : IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) TaskConfig(org.apache.flink.runtime.operators.util.TaskConfig) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) Counter(org.apache.flink.metrics.Counter) CountingMutableObjectIterator(org.apache.flink.runtime.operators.util.metrics.CountingMutableObjectIterator)

Example 15 with IOManager

use of org.apache.flink.runtime.io.disk.iomanager.IOManager in project flink by apache.

the class JoinDriver method prepare.

/**
 * Prepares the join: gathers the managers, inputs, serializers and comparators from the task
 * context, instantiates the join iterator that matches the configured driver strategy and the
 * object-reuse setting, and opens it.
 *
 * @throws Exception if no pair comparator factory is configured, if the driver strategy is
 *     not one of the supported join strategies, or if creating or opening the iterator fails
 */
@Override
public void prepare() throws Exception {
    final TaskConfig taskConfig = this.taskContext.getTaskConfig();
    final Counter recordsInCounter = this.taskContext.getMetricGroup().getIOMetricGroup().getNumRecordsInCounter();

    // obtain task manager's memory manager and I/O manager
    final MemoryManager memMgr = this.taskContext.getMemoryManager();
    final IOManager ioMgr = this.taskContext.getIOManager();

    // set up memory and I/O parameters
    final double memoryFraction = taskConfig.getRelativeMemoryDriver();
    final int pageCount = memMgr.computeNumberOfPages(memoryFraction);

    final DriverStrategy strategy = taskConfig.getDriverStrategy();

    // both inputs report into the same records-in counter
    final MutableObjectIterator<IT1> firstInput = new CountingMutableObjectIterator<>(this.taskContext.<IT1>getInput(0), recordsInCounter);
    final MutableObjectIterator<IT2> secondInput = new CountingMutableObjectIterator<>(this.taskContext.<IT2>getInput(1), recordsInCounter);

    // get the serializers and comparators of both inputs
    final TypeSerializer<IT1> firstSerializer = this.taskContext.<IT1>getInputSerializer(0).getSerializer();
    final TypeSerializer<IT2> secondSerializer = this.taskContext.<IT2>getInputSerializer(1).getSerializer();
    final TypeComparator<IT1> firstComparator = this.taskContext.getDriverComparator(0);
    final TypeComparator<IT2> secondComparator = this.taskContext.getDriverComparator(1);

    final TypePairComparatorFactory<IT1, IT2> pairFactory = taskConfig.getPairComparatorFactory(this.taskContext.getUserCodeClassLoader());
    if (pairFactory == null) {
        throw new Exception("Missing pair comparator factory for join driver");
    }

    final boolean reuseObjects = this.taskContext.getExecutionConfig().isObjectReuseEnabled();
    if (LOG.isDebugEnabled()) {
        final String reuseState = reuseObjects ? "ENABLED" : "DISABLED";
        LOG.debug("Join Driver object reuse: " + reuseState + ".");
    }

    final boolean useBloomFilters = this.taskContext.getTaskManagerInfo().getConfiguration().getBoolean(ConfigConstants.RUNTIME_HASH_JOIN_BLOOM_FILTERS_KEY, ConfigConstants.DEFAULT_RUNTIME_HASH_JOIN_BLOOM_FILTERS);

    // create the joining iterator according to the provided local strategy; within each
    // strategy the object-reuse setting selects the reusing or non-reusing variant
    switch (strategy) {
        case INNER_MERGE:
            if (reuseObjects) {
                this.joinIterator = new ReusingMergeInnerJoinIterator<>(firstInput, secondInput, firstSerializer, firstComparator, secondSerializer, secondComparator, pairFactory.createComparator12(firstComparator, secondComparator), memMgr, ioMgr, pageCount, this.taskContext.getContainingTask());
            } else {
                this.joinIterator = new NonReusingMergeInnerJoinIterator<>(firstInput, secondInput, firstSerializer, firstComparator, secondSerializer, secondComparator, pairFactory.createComparator12(firstComparator, secondComparator), memMgr, ioMgr, pageCount, this.taskContext.getContainingTask());
            }
            break;
        case HYBRIDHASH_BUILD_FIRST:
            if (reuseObjects) {
                this.joinIterator = new ReusingBuildFirstHashJoinIterator<>(firstInput, secondInput, firstSerializer, firstComparator, secondSerializer, secondComparator, pairFactory.createComparator21(firstComparator, secondComparator), memMgr, ioMgr, this.taskContext.getContainingTask(), memoryFraction, false, false, useBloomFilters);
            } else {
                this.joinIterator = new NonReusingBuildFirstHashJoinIterator<>(firstInput, secondInput, firstSerializer, firstComparator, secondSerializer, secondComparator, pairFactory.createComparator21(firstComparator, secondComparator), memMgr, ioMgr, this.taskContext.getContainingTask(), memoryFraction, false, false, useBloomFilters);
            }
            break;
        case HYBRIDHASH_BUILD_SECOND:
            if (reuseObjects) {
                this.joinIterator = new ReusingBuildSecondHashJoinIterator<>(firstInput, secondInput, firstSerializer, firstComparator, secondSerializer, secondComparator, pairFactory.createComparator12(firstComparator, secondComparator), memMgr, ioMgr, this.taskContext.getContainingTask(), memoryFraction, false, false, useBloomFilters);
            } else {
                this.joinIterator = new NonReusingBuildSecondHashJoinIterator<>(firstInput, secondInput, firstSerializer, firstComparator, secondSerializer, secondComparator, pairFactory.createComparator12(firstComparator, secondComparator), memMgr, ioMgr, this.taskContext.getContainingTask(), memoryFraction, false, false, useBloomFilters);
            }
            break;
        default:
            throw new Exception("Unsupported driver strategy for join driver: " + strategy.name());
    }

    // open the iterator - this triggers the sorting or hash-table building
    // and blocks until the iterator is ready
    this.joinIterator.open();

    if (LOG.isDebugEnabled()) {
        LOG.debug(this.taskContext.formatLogString("join task iterator ready."));
    }
}
Also used : TaskConfig(org.apache.flink.runtime.operators.util.TaskConfig) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Counter(org.apache.flink.metrics.Counter) CountingMutableObjectIterator(org.apache.flink.runtime.operators.util.metrics.CountingMutableObjectIterator) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) MemoryManager(org.apache.flink.runtime.memory.MemoryManager)

Aggregations

IOManager (org.apache.flink.runtime.io.disk.iomanager.IOManager)30 IOManagerAsync (org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync)22 MemoryManager (org.apache.flink.runtime.memory.MemoryManager)19 Test (org.junit.Test)19 MemorySegment (org.apache.flink.core.memory.MemorySegment)15 DummyInvokable (org.apache.flink.runtime.operators.testutils.DummyInvokable)12 IOException (java.io.IOException)10 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)9 TupleTypeInfo (org.apache.flink.api.java.typeutils.TupleTypeInfo)7 File (java.io.File)6 ArrayList (java.util.ArrayList)6 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)6 AbstractInvokable (org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable)5 BufferedReader (java.io.BufferedReader)4 FileReader (java.io.FileReader)4 Random (java.util.Random)4 JobID (org.apache.flink.api.common.JobID)3 Tuple3 (org.apache.flink.api.java.tuple.Tuple3)3 RuntimeSerializerFactory (org.apache.flink.api.java.typeutils.runtime.RuntimeSerializerFactory)3 Configuration (org.apache.flink.configuration.Configuration)3