Use of org.apache.flink.runtime.io.disk.iomanager.IOManager in project flink by apache.
The class HashTableITCase, method testSpillingHashJoinWithMassiveCollisionsIntPair.
@Test
public void testSpillingHashJoinWithMassiveCollisionsIntPair() throws IOException {
    // the following two values are known to have a hash-code collision on the initial level.
    // we use them to make sure one partition grows over-proportionally large
    final int REPEATED_VALUE_1 = 40559;
    final int REPEATED_VALUE_2 = 92882;
    final int REPEATED_VALUE_COUNT_BUILD = 200000;
    final int REPEATED_VALUE_COUNT_PROBE = 5;
    final int NUM_KEYS = 1000000;
    final int BUILD_VALS_PER_KEY = 3;
    final int PROBE_VALS_PER_KEY = 10;

    // create a build input that gives 3 million pairs with 3 values sharing the same key,
    // plus 400k pairs with two colliding keys
    MutableObjectIterator<IntPair> build1 = new UniformIntPairGenerator(NUM_KEYS, BUILD_VALS_PER_KEY, false);
    MutableObjectIterator<IntPair> build2 = new ConstantsIntPairsIterator(REPEATED_VALUE_1, 17, REPEATED_VALUE_COUNT_BUILD);
    MutableObjectIterator<IntPair> build3 = new ConstantsIntPairsIterator(REPEATED_VALUE_2, 23, REPEATED_VALUE_COUNT_BUILD);
    List<MutableObjectIterator<IntPair>> builds = new ArrayList<MutableObjectIterator<IntPair>>();
    builds.add(build1);
    builds.add(build2);
    builds.add(build3);
    MutableObjectIterator<IntPair> buildInput = new UnionIterator<IntPair>(builds);

    // create a probe input that gives 10 million pairs with 10 values sharing a key
    MutableObjectIterator<IntPair> probe1 = new UniformIntPairGenerator(NUM_KEYS, PROBE_VALS_PER_KEY, true);
    MutableObjectIterator<IntPair> probe2 = new ConstantsIntPairsIterator(REPEATED_VALUE_1, 17, REPEATED_VALUE_COUNT_PROBE);
    MutableObjectIterator<IntPair> probe3 = new ConstantsIntPairsIterator(REPEATED_VALUE_2, 23, REPEATED_VALUE_COUNT_PROBE);
    List<MutableObjectIterator<IntPair>> probes = new ArrayList<MutableObjectIterator<IntPair>>();
    probes.add(probe1);
    probes.add(probe2);
    probes.add(probe3);
    MutableObjectIterator<IntPair> probeInput = new UnionIterator<IntPair>(probes);

    // allocate the memory for the HashTable
    List<MemorySegment> memSegments;
    try {
        memSegments = this.memManager.allocatePages(MEM_OWNER, 896);
    } catch (MemoryAllocationException maex) {
        fail("Memory for the Join could not be provided.");
        return;
    }

    // create the I/O access for spilling
    IOManager ioManager = new IOManagerAsync();

    // create the map for validating the results
    HashMap<Integer, Long> map = new HashMap<Integer, Long>(NUM_KEYS);

    // ----------------------------------------------------------------------------------------

    final MutableHashTable<IntPair, IntPair> join = new MutableHashTable<IntPair, IntPair>(
            this.pairBuildSideAccesssor, this.pairProbeSideAccesssor,
            this.pairBuildSideComparator, this.pairProbeSideComparator, this.pairComparator,
            memSegments, ioManager);
    join.open(buildInput, probeInput);

    IntPair record;
    final IntPair recordReuse = new IntPair();

    while (join.nextRecord()) {
        int numBuildValues = 0;
        final IntPair probeRec = join.getCurrentProbeRecord();
        int key = probeRec.getKey();

        MutableObjectIterator<IntPair> buildSide = join.getBuildSideIterator();
        if ((record = buildSide.next(recordReuse)) != null) {
            numBuildValues = 1;
            Assert.assertEquals("Probe-side key was different than build-side key.", key, record.getKey());
        } else {
            fail("No build side values found for a probe key.");
        }
        while ((record = buildSide.next(recordReuse)) != null) {
            numBuildValues++;
            Assert.assertEquals("Probe-side key was different than build-side key.", key, record.getKey());
        }

        Long contained = map.get(key);
        if (contained == null) {
            contained = Long.valueOf(numBuildValues);
        } else {
            contained = Long.valueOf(contained.longValue() + numBuildValues);
        }
        map.put(key, contained);
    }

    join.close();

    Assert.assertEquals("Wrong number of keys", NUM_KEYS, map.size());
    for (Map.Entry<Integer, Long> entry : map.entrySet()) {
        long val = entry.getValue();
        int key = entry.getKey();
        Assert.assertEquals("Wrong number of values in per-key cross product for key " + key,
                (key == REPEATED_VALUE_1 || key == REPEATED_VALUE_2)
                        ? (PROBE_VALS_PER_KEY + REPEATED_VALUE_COUNT_PROBE) * (BUILD_VALS_PER_KEY + REPEATED_VALUE_COUNT_BUILD)
                        : PROBE_VALS_PER_KEY * BUILD_VALS_PER_KEY,
                val);
    }

    // ----------------------------------------------------------------------------------------

    this.memManager.release(join.getFreedMemory());
}
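The final assertion's arithmetic follows directly from the inputs: each of the NUM_KEYS regular keys yields BUILD_VALS_PER_KEY * PROBE_VALS_PER_KEY = 30 matches, while the two colliding keys each yield (3 + 200000) * (10 + 5) = 3,000,045 matches. The ConstantsIntPairsIterator helper is an inner class of HashTableITCase; a minimal sketch of how such a constant-pairs source can be written against the MutableObjectIterator contract (a reconstruction for illustration, not the verbatim Flink source):

// Sketch: emits `count` copies of the pair (key, value), then signals end of input with null.
private static final class ConstantsIntPairsIterator implements MutableObjectIterator<IntPair> {

    private final int key;
    private final int value;
    private int numLeft;

    ConstantsIntPairsIterator(int key, int value, int count) {
        this.key = key;
        this.value = value;
        this.numLeft = count;
    }

    @Override
    public IntPair next(IntPair reuse) {
        if (numLeft > 0) {
            numLeft--;
            reuse.setKey(key);
            reuse.setValue(value);
            return reuse;
        }
        return null;
    }

    @Override
    public IntPair next() {
        return next(new IntPair());
    }
}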
Use of org.apache.flink.runtime.io.disk.iomanager.IOManager in project flink by apache.
The class BatchTask, method initLocalStrategies.
/**
 * NOTE: This method must be invoked after the invocation of {@code #initInputReaders()} and
 * {@code #initInputSerializersAndComparators(int)}!
 */
protected void initLocalStrategies(int numInputs) throws Exception {
    final MemoryManager memMan = getMemoryManager();
    final IOManager ioMan = getIOManager();

    this.localStrategies = new CloseableInputProvider<?>[numInputs];
    this.inputs = new MutableObjectIterator<?>[numInputs];
    this.excludeFromReset = new boolean[numInputs];
    this.inputIsCached = new boolean[numInputs];
    this.inputIsAsyncMaterialized = new boolean[numInputs];
    this.materializationMemory = new int[numInputs];

    // set up the local strategies first, such that they can work before any temp barrier is created
    for (int i = 0; i < numInputs; i++) {
        initInputLocalStrategy(i);
    }

    // we do another loop over the inputs, because we want to instantiate all
    // sorters, etc. before requesting the first input (as this call may block)

    // we have two types of materialized inputs, and both are replayable (can act as a cache):
    // the first variant materializes in a different thread and hence acts as a pipeline breaker;
    // it should only be used if a pipeline breaker is needed.
    // the second variant spills to the side and will not read unless the result is also consumed
    // in a pipelined fashion.
    this.resettableInputs = new SpillingResettableMutableObjectIterator<?>[numInputs];
    this.tempBarriers = new TempBarrier<?>[numInputs];

    for (int i = 0; i < numInputs; i++) {
        final int memoryPages;
        final boolean async = this.config.isInputAsynchronouslyMaterialized(i);
        final boolean cached = this.config.isInputCached(i);
        this.inputIsAsyncMaterialized[i] = async;
        this.inputIsCached[i] = cached;

        if (async || cached) {
            memoryPages = memMan.computeNumberOfPages(this.config.getRelativeInputMaterializationMemory(i));
            if (memoryPages <= 0) {
                throw new Exception("Input marked as materialized/cached, but no memory for materialization provided.");
            }
            this.materializationMemory[i] = memoryPages;
        } else {
            memoryPages = 0;
        }

        if (async) {
            @SuppressWarnings({ "unchecked", "rawtypes" })
            TempBarrier<?> barrier = new TempBarrier(this, getInput(i), this.inputSerializers[i], memMan, ioMan, memoryPages);
            barrier.startReading();
            this.tempBarriers[i] = barrier;
            this.inputs[i] = null;
        } else if (cached) {
            @SuppressWarnings({ "unchecked", "rawtypes" })
            SpillingResettableMutableObjectIterator<?> iter = new SpillingResettableMutableObjectIterator(
                    getInput(i), this.inputSerializers[i].getSerializer(),
                    getMemoryManager(), getIOManager(), memoryPages, this);
            this.resettableInputs[i] = iter;
            this.inputs[i] = iter;
        }
    }
}
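Both materialization variants size their buffers through MemoryManager.computeNumberOfPages(double), which maps the relative memory fraction from the task configuration to a count of managed memory pages. A rough sketch of that mapping, assuming the manager knows its total page budget (a hypothetical simplification; the real method also derives the budget from the configured memory size and page size):

// Hypothetical simplification of MemoryManager.computeNumberOfPages(double):
// scale the total number of managed pages by the requested fraction.
int computeNumberOfPages(double fraction, int totalNumPages) {
    if (fraction <= 0.0 || fraction > 1.0) {
        throw new IllegalArgumentException("The fraction of memory to allocate must be within (0, 1].");
    }
    return (int) (totalNumPages * fraction);
}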
Use of org.apache.flink.runtime.io.disk.iomanager.IOManager in project flink by apache.
The class BatchTask, method resetAllInputs.
protected void resetAllInputs() throws Exception {
    // NOTE: we need to do this before closing the local strategies
    for (int i = 0; i < this.inputs.length; i++) {
        if (this.inputIsCached[i] && this.resettableInputs[i] != null) {
            this.resettableInputs[i].consumeAndCacheRemainingData();
        }
    }

    // now close the local strategies; the cached inputs have consumed their remaining data above
    for (int i = 0; i < this.localStrategies.length; i++) {
        if (this.localStrategies[i] != null) {
            this.localStrategies[i].close();
            this.localStrategies[i] = null;
        }
    }

    final MemoryManager memMan = getMemoryManager();
    final IOManager ioMan = getIOManager();

    // reset the caches, or re-run the input local strategy
    for (int i = 0; i < this.inputs.length; i++) {
        if (this.excludeFromReset[i]) {
            if (this.tempBarriers[i] != null) {
                this.tempBarriers[i].close();
                this.tempBarriers[i] = null;
            } else if (this.resettableInputs[i] != null) {
                this.resettableInputs[i].close();
                this.resettableInputs[i] = null;
            }
        } else {
            // make sure the input is not available directly, but is lazily fetched again
            this.inputs[i] = null;

            if (this.inputIsCached[i]) {
                if (this.tempBarriers[i] != null) {
                    this.inputs[i] = this.tempBarriers[i].getIterator();
                } else if (this.resettableInputs[i] != null) {
                    this.resettableInputs[i].reset();
                    this.inputs[i] = this.resettableInputs[i];
                } else {
                    throw new RuntimeException("Found a resettable input, but no temp barrier and no resettable iterator.");
                }
            } else {
                // close the async barrier if there is one
                if (this.tempBarriers[i] != null) {
                    this.tempBarriers[i].close();
                }

                // recreate the local strategy
                initInputLocalStrategy(i);

                if (this.inputIsAsyncMaterialized[i]) {
                    final int pages = this.materializationMemory[i];
                    @SuppressWarnings({ "unchecked", "rawtypes" })
                    TempBarrier<?> barrier = new TempBarrier(this, getInput(i), this.inputSerializers[i], memMan, ioMan, pages);
                    barrier.startReading();
                    this.tempBarriers[i] = barrier;
                    this.inputs[i] = null;
                }
            }
        }
    }
}
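The cached branch above relies on the SpillingResettableMutableObjectIterator contract: the data is consumed (or explicitly cached via consumeAndCacheRemainingData()) once, after which reset() replays it without re-reading the source. A minimal usage sketch under that assumption; sourceIterator, serializer, memoryManager, ioManager, numPages, and parent are placeholders standing in for the arguments seen in initLocalStrategies above:

// Sketch: cache an input on first consumption, then replay it after reset().
SpillingResettableMutableObjectIterator<IntPair> cached =
        new SpillingResettableMutableObjectIterator<>(
                sourceIterator, serializer, memoryManager, ioManager, numPages, parent);
cached.open();

IntPair reuse = new IntPair();
while (cached.next(reuse) != null) {
    // first pass: records come from the source and are cached on the side
}

cached.reset();
while (cached.next(reuse) != null) {
    // second pass: records come from the cache; the source is not re-read
}
cached.close();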
Use of org.apache.flink.runtime.io.disk.iomanager.IOManager in project flink by apache.
The class AbstractOuterJoinDriver, method prepare.
@Override
public void prepare() throws Exception {
    final TaskConfig config = this.taskContext.getTaskConfig();

    // obtain the task manager's memory manager and I/O manager
    final MemoryManager memoryManager = this.taskContext.getMemoryManager();
    final IOManager ioManager = this.taskContext.getIOManager();

    // set up memory and I/O parameters
    final double driverMemFraction = config.getRelativeMemoryDriver();
    final DriverStrategy ls = config.getDriverStrategy();

    final Counter numRecordsIn = this.taskContext.getMetricGroup().getIOMetricGroup().getNumRecordsInCounter();
    final MutableObjectIterator<IT1> in1 = new CountingMutableObjectIterator<>(this.taskContext.<IT1>getInput(0), numRecordsIn);
    final MutableObjectIterator<IT2> in2 = new CountingMutableObjectIterator<>(this.taskContext.<IT2>getInput(1), numRecordsIn);

    // get the serializers and comparators
    final TypeSerializer<IT1> serializer1 = this.taskContext.<IT1>getInputSerializer(0).getSerializer();
    final TypeSerializer<IT2> serializer2 = this.taskContext.<IT2>getInputSerializer(1).getSerializer();
    final TypeComparator<IT1> comparator1 = this.taskContext.getDriverComparator(0);
    final TypeComparator<IT2> comparator2 = this.taskContext.getDriverComparator(1);
    final TypePairComparatorFactory<IT1, IT2> pairComparatorFactory =
            config.getPairComparatorFactory(this.taskContext.getUserCodeClassLoader());
    if (pairComparatorFactory == null) {
        throw new Exception("Missing pair comparator factory for outer join driver");
    }

    ExecutionConfig executionConfig = taskContext.getExecutionConfig();
    boolean objectReuseEnabled = executionConfig.isObjectReuseEnabled();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Outer Join Driver object reuse: " + (objectReuseEnabled ? "ENABLED" : "DISABLED") + ".");
    }

    // create the outer join iterator according to the provided local strategy
    if (objectReuseEnabled) {
        this.outerJoinIterator = getReusingOuterJoinIterator(ls, in1, in2, serializer1, comparator1,
                serializer2, comparator2, pairComparatorFactory, memoryManager, ioManager, driverMemFraction);
    } else {
        this.outerJoinIterator = getNonReusingOuterJoinIterator(ls, in1, in2, serializer1, comparator1,
                serializer2, comparator2, pairComparatorFactory, memoryManager, ioManager, driverMemFraction);
    }
    this.outerJoinIterator.open();

    if (LOG.isDebugEnabled()) {
        LOG.debug(this.taskContext.formatLogString("outer join task iterator ready."));
    }
}
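Wrapping both inputs in CountingMutableObjectIterator is what feeds the task's numRecordsIn metric: every record pulled through the driver increments the counter. A hedged reconstruction of that wrapper's idea (the real class lives in org.apache.flink.runtime.operators.util.metrics; details may differ):

import java.io.IOException;

import org.apache.flink.metrics.Counter;
import org.apache.flink.util.MutableObjectIterator;

// Sketch: delegate to the wrapped iterator and bump the counter for every returned record.
public class CountingMutableObjectIterator<T> implements MutableObjectIterator<T> {

    private final MutableObjectIterator<T> iterator;
    private final Counter numRecordsIn;

    public CountingMutableObjectIterator(MutableObjectIterator<T> iterator, Counter numRecordsIn) {
        this.iterator = iterator;
        this.numRecordsIn = numRecordsIn;
    }

    @Override
    public T next(T reuse) throws IOException {
        T record = iterator.next(reuse);
        if (record != null) {
            numRecordsIn.inc();
        }
        return record;
    }

    @Override
    public T next() throws IOException {
        T record = iterator.next();
        if (record != null) {
            numRecordsIn.inc();
        }
        return record;
    }
}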
Use of org.apache.flink.runtime.io.disk.iomanager.IOManager in project flink by apache.
The class JoinDriver, method prepare.
@Override
public void prepare() throws Exception {
    final TaskConfig config = this.taskContext.getTaskConfig();
    final Counter numRecordsIn = this.taskContext.getMetricGroup().getIOMetricGroup().getNumRecordsInCounter();

    // obtain the task manager's memory manager and I/O manager
    final MemoryManager memoryManager = this.taskContext.getMemoryManager();
    final IOManager ioManager = this.taskContext.getIOManager();

    // set up memory and I/O parameters
    final double fractionAvailableMemory = config.getRelativeMemoryDriver();
    final int numPages = memoryManager.computeNumberOfPages(fractionAvailableMemory);

    // test minimum memory requirements
    final DriverStrategy ls = config.getDriverStrategy();

    final MutableObjectIterator<IT1> in1 = new CountingMutableObjectIterator<>(this.taskContext.<IT1>getInput(0), numRecordsIn);
    final MutableObjectIterator<IT2> in2 = new CountingMutableObjectIterator<>(this.taskContext.<IT2>getInput(1), numRecordsIn);

    // get the key positions and types
    final TypeSerializer<IT1> serializer1 = this.taskContext.<IT1>getInputSerializer(0).getSerializer();
    final TypeSerializer<IT2> serializer2 = this.taskContext.<IT2>getInputSerializer(1).getSerializer();
    final TypeComparator<IT1> comparator1 = this.taskContext.getDriverComparator(0);
    final TypeComparator<IT2> comparator2 = this.taskContext.getDriverComparator(1);
    final TypePairComparatorFactory<IT1, IT2> pairComparatorFactory =
            config.getPairComparatorFactory(this.taskContext.getUserCodeClassLoader());
    if (pairComparatorFactory == null) {
        throw new Exception("Missing pair comparator factory for join driver");
    }

    ExecutionConfig executionConfig = taskContext.getExecutionConfig();
    boolean objectReuseEnabled = executionConfig.isObjectReuseEnabled();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Join Driver object reuse: " + (objectReuseEnabled ? "ENABLED" : "DISABLED") + ".");
    }

    boolean hashJoinUseBitMaps = taskContext.getTaskManagerInfo().getConfiguration().getBoolean(
            ConfigConstants.RUNTIME_HASH_JOIN_BLOOM_FILTERS_KEY,
            ConfigConstants.DEFAULT_RUNTIME_HASH_JOIN_BLOOM_FILTERS);

    // create the joining iterator according to the provided local strategy
    if (objectReuseEnabled) {
        switch (ls) {
            case INNER_MERGE:
                this.joinIterator = new ReusingMergeInnerJoinIterator<>(in1, in2, serializer1, comparator1,
                        serializer2, comparator2, pairComparatorFactory.createComparator12(comparator1, comparator2),
                        memoryManager, ioManager, numPages, this.taskContext.getContainingTask());
                break;
            case HYBRIDHASH_BUILD_FIRST:
                this.joinIterator = new ReusingBuildFirstHashJoinIterator<>(in1, in2, serializer1, comparator1,
                        serializer2, comparator2, pairComparatorFactory.createComparator21(comparator1, comparator2),
                        memoryManager, ioManager, this.taskContext.getContainingTask(),
                        fractionAvailableMemory, false, false, hashJoinUseBitMaps);
                break;
            case HYBRIDHASH_BUILD_SECOND:
                this.joinIterator = new ReusingBuildSecondHashJoinIterator<>(in1, in2, serializer1, comparator1,
                        serializer2, comparator2, pairComparatorFactory.createComparator12(comparator1, comparator2),
                        memoryManager, ioManager, this.taskContext.getContainingTask(),
                        fractionAvailableMemory, false, false, hashJoinUseBitMaps);
                break;
            default:
                throw new Exception("Unsupported driver strategy for join driver: " + ls.name());
        }
    } else {
        switch (ls) {
            case INNER_MERGE:
                this.joinIterator = new NonReusingMergeInnerJoinIterator<>(in1, in2, serializer1, comparator1,
                        serializer2, comparator2, pairComparatorFactory.createComparator12(comparator1, comparator2),
                        memoryManager, ioManager, numPages, this.taskContext.getContainingTask());
                break;
            case HYBRIDHASH_BUILD_FIRST:
                this.joinIterator = new NonReusingBuildFirstHashJoinIterator<>(in1, in2, serializer1, comparator1,
                        serializer2, comparator2, pairComparatorFactory.createComparator21(comparator1, comparator2),
                        memoryManager, ioManager, this.taskContext.getContainingTask(),
                        fractionAvailableMemory, false, false, hashJoinUseBitMaps);
                break;
            case HYBRIDHASH_BUILD_SECOND:
                this.joinIterator = new NonReusingBuildSecondHashJoinIterator<>(in1, in2, serializer1, comparator1,
                        serializer2, comparator2, pairComparatorFactory.createComparator12(comparator1, comparator2),
                        memoryManager, ioManager, this.taskContext.getContainingTask(),
                        fractionAvailableMemory, false, false, hashJoinUseBitMaps);
                break;
            default:
                throw new Exception("Unsupported driver strategy for join driver: " + ls.name());
        }
    }

    // open the iterator - this triggers the sorting or hash-table building
    // and blocks until the iterator is ready
    this.joinIterator.open();

    if (LOG.isDebugEnabled()) {
        LOG.debug(this.taskContext.formatLogString("join task iterator ready."));
    }
}
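The hashJoinUseBitMaps flag is read from the TaskManager's configuration, so Bloom filters for spilled hash-join partitions are toggled cluster-wide (typically via flink-conf.yaml in this Flink version). Assuming the same ConfigConstants key, setting and reading the flag on an org.apache.flink.configuration.Configuration looks roughly like this:

// Sketch: enable Bloom filters for spilled hash-join partitions.
Configuration config = new Configuration();
config.setBoolean(ConfigConstants.RUNTIME_HASH_JOIN_BLOOM_FILTERS_KEY, true);

// reading it back mirrors the lookup in JoinDriver.prepare() above
boolean useBloomFilters = config.getBoolean(
        ConfigConstants.RUNTIME_HASH_JOIN_BLOOM_FILTERS_KEY,
        ConfigConstants.DEFAULT_RUNTIME_HASH_JOIN_BLOOM_FILTERS);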