Search in sources:

Example 1 with TupleComparator

Use of org.apache.flink.api.java.typeutils.runtime.TupleComparator in the Apache Flink project.

In class LargeRecordHandler, method addRecord:

// --------------------------------------------------------------------------------------------
/**
 * Writes one record to the records spill file and its key fields, followed by the record's
 * offset in that file, to the keys spill file.
 *
 * <p>While no records output file exists yet, the call first builds the key tuple
 * serializer/comparator from the keys extracted out of {@code record}, splits the available
 * memory segments between keys and records, and opens both spill channels.
 *
 * @param record the record to spill
 * @return the write offset of the records file, read before {@code record} was serialized
 * @throws IOException declared for the serialization and channel operations used here
 * @throws IllegalStateException on the initializing call, if the handler has been closed or a
 *     records reader already exists
 */
@SuppressWarnings("unchecked")
public long addRecord(T record) throws IOException {
    final boolean needsInitialization = (recordsOutFile == null);
    if (needsInitialization) {
        if (closed) {
            throw new IllegalStateException("The large record handler has been closed.");
        }
        if (recordsReader != null) {
            throw new IllegalStateException("The handler has already switched to sorting.");
        }
        LOG.debug("Initializing the large record spilling...");

        // set up the key utilities, using this record's keys to pick the serializers
        final TypeComparator<?>[] flatComparators = comparator.getFlatComparators();
        numKeyFields = flatComparators.length;
        final Object[] sampleKeys = new Object[numKeyFields];
        comparator.extractKeys(record, sampleKeys, 0);
        final TypeSerializer<?>[] keyFieldSerializers = new TypeSerializer<?>[numKeyFields];
        final TypeSerializer<?>[] tupleFieldSerializers = new TypeSerializer<?>[numKeyFields + 1];
        final int[] keyPositions = new int[numKeyFields];
        for (int field = 0; field < numKeyFields; field++) {
            keyPositions[field] = field;
            final TypeSerializer<?> fieldSerializer = createSerializer(sampleKeys[field], field);
            keyFieldSerializers[field] = fieldSerializer;
            tupleFieldSerializers[field] = fieldSerializer;
        }
        // the last tuple field carries the record offset, hence the long serializer
        tupleFieldSerializers[numKeyFields] = LongSerializer.INSTANCE;
        final Class<Tuple> tupleClass = (Class<Tuple>) Tuple.getTupleClass(numKeyFields + 1);
        keySerializer = new TupleSerializer<>(tupleClass, tupleFieldSerializers);
        keyComparator = new TupleComparator<>(keyPositions, flatComparators, keyFieldSerializers);
        keyTuple = keySerializer.createInstance();

        // divide the memory segments: the leading ones buffer keys, the remainder buffers records
        final int totalNumSegments = memory.size();
        final int segmentsForKeys;
        if (totalNumSegments >= 2 * MAX_SEGMENTS_FOR_KEY_SPILLING) {
            segmentsForKeys = MAX_SEGMENTS_FOR_KEY_SPILLING;
        } else {
            segmentsForKeys = Math.max(MIN_SEGMENTS_FOR_KEY_SPILLING, totalNumSegments - MAX_SEGMENTS_FOR_KEY_SPILLING);
        }
        final List<MemorySegment> keysMemory = new ArrayList<>();
        final List<MemorySegment> recordsMemory = new ArrayList<>();
        for (int segment = 0; segment < segmentsForKeys; segment++) {
            keysMemory.add(memory.get(segment));
        }
        for (int segment = segmentsForKeys; segment < totalNumSegments; segment++) {
            recordsMemory.add(memory.get(segment));
        }

        // open the spill channels and the output views writing to them
        recordsChannel = ioManager.createChannel();
        keysChannel = ioManager.createChannel();
        recordsOutFile = new FileChannelOutputView(ioManager.createBlockChannelWriter(recordsChannel), memManager, recordsMemory, memManager.getPageSize());
        keysOutFile = new FileChannelOutputView(ioManager.createBlockChannelWriter(keysChannel), memManager, keysMemory, memManager.getPageSize());
    }

    final long offset = recordsOutFile.getWriteOffset();
    if (offset < 0) {
        throw new RuntimeException("wrong offset");
    }

    // fill the reusable key tuple: key fields first, record offset in the trailing field
    final Object[] extractedKeys = new Object[numKeyFields];
    comparator.extractKeys(record, extractedKeys, 0);
    for (int field = 0; field < numKeyFields; field++) {
        keyTuple.setField(extractedKeys[field], field);
    }
    keyTuple.setField(offset, numKeyFields);

    // the key entry is written before the record itself
    keySerializer.serialize(keyTuple, keysOutFile);
    serializer.serialize(record, recordsOutFile);
    recordCounter++;
    return offset;
}
Also used : FileChannelOutputView(org.apache.flink.runtime.io.disk.FileChannelOutputView) ArrayList(java.util.ArrayList) TupleComparator(org.apache.flink.api.java.typeutils.runtime.TupleComparator) MemorySegment(org.apache.flink.core.memory.MemorySegment) TupleSerializer(org.apache.flink.api.java.typeutils.runtime.TupleSerializer) Tuple(org.apache.flink.api.java.tuple.Tuple)

Example 2 with TupleComparator

Use of org.apache.flink.api.java.typeutils.runtime.TupleComparator in the Apache Flink project.

In class RandomSortMergeOuterJoinTest, method testOuterJoinWithHighNumberOfCommonKeys:

/**
 * Runs an outer join over two inputs that each merge a generated tuple stream with a stream of
 * constant-key duplicates (key 13), and checks the result against matches computed up front.
 *
 * <p>The expected matches are computed with {@code joinValues} from a first pass over the
 * inputs; the generators and iterators are then reset and the inputs rebuilt for the actual
 * join. The test fails if any expected match collection is still non-empty afterwards.
 *
 * @param outerJoinType join type handed to {@code joinValues} and {@code getOperator}
 * @param input1Size element count handed to the first {@code TupleGeneratorIterator}
 * @param input1Duplicates count handed to the first {@code TupleConstantValueIterator}
 * @param input1ValueLength value length handed to the first {@code TupleGenerator}
 * @param input1KeyDensity key density handed to the first {@code TupleGenerator}
 * @param input2Size element count handed to the second {@code TupleGeneratorIterator}
 * @param input2Duplicates count handed to the second {@code TupleConstantValueIterator}
 * @param input2ValueLength value length handed to the second {@code TupleGenerator}
 * @param input2KeyDensity key density handed to the second {@code TupleGenerator}
 */
// Each warning name must be its own array element; a single "unchecked, rawtypes" string is
// not a recognized warning name.
@SuppressWarnings({"unchecked", "rawtypes"})
protected void testOuterJoinWithHighNumberOfCommonKeys(FlinkJoinType outerJoinType, int input1Size, int input1Duplicates, int input1ValueLength, float input1KeyDensity, int input2Size, int input2Duplicates, int input2ValueLength, float input2KeyDensity) {
    TypeComparator<Tuple2<Integer, String>> comparator1 = new TupleComparator<>(new int[] { 0 }, new TypeComparator<?>[] { new IntComparator(true) }, new TypeSerializer<?>[] { IntSerializer.INSTANCE });
    TypeComparator<Tuple2<Integer, String>> comparator2 = new TupleComparator<>(new int[] { 0 }, new TypeComparator<?>[] { new IntComparator(true) }, new TypeSerializer<?>[] { IntSerializer.INSTANCE });
    final int duplicateKey = 13;
    try {
        final TupleGenerator generator1 = new TupleGenerator(SEED1, 500, input1KeyDensity, input1ValueLength, KeyMode.SORTED_SPARSE, ValueMode.RANDOM_LENGTH, null);
        final TupleGenerator generator2 = new TupleGenerator(SEED2, 500, input2KeyDensity, input2ValueLength, KeyMode.SORTED_SPARSE, ValueMode.RANDOM_LENGTH, null);
        final TupleGeneratorIterator gen1Iter = new TupleGeneratorIterator(generator1, input1Size);
        final TupleGeneratorIterator gen2Iter = new TupleGeneratorIterator(generator2, input2Size);
        final TupleConstantValueIterator const1Iter = new TupleConstantValueIterator(duplicateKey, "LEFT String for Duplicate Keys", input1Duplicates);
        final TupleConstantValueIterator const2Iter = new TupleConstantValueIterator(duplicateKey, "RIGHT String for Duplicate Keys", input2Duplicates);
        final List<MutableObjectIterator<Tuple2<Integer, String>>> inList1 = new ArrayList<>();
        inList1.add(gen1Iter);
        inList1.add(const1Iter);
        final List<MutableObjectIterator<Tuple2<Integer, String>>> inList2 = new ArrayList<>();
        inList2.add(gen2Iter);
        inList2.add(const2Iter);
        MutableObjectIterator<Tuple2<Integer, String>> input1 = new MergeIterator<>(inList1, comparator1.duplicate());
        MutableObjectIterator<Tuple2<Integer, String>> input2 = new MergeIterator<>(inList2, comparator2.duplicate());
        // collect expected data
        final Map<Integer, Collection<Match>> expectedMatchesMap = joinValues(RandomSortMergeInnerJoinTest.collectData(input1), RandomSortMergeInnerJoinTest.collectData(input2), outerJoinType);
        // re-create the whole thing for actual processing
        // reset the generators and iterators
        generator1.reset();
        generator2.reset();
        const1Iter.reset();
        const2Iter.reset();
        gen1Iter.reset();
        gen2Iter.reset();
        inList1.clear();
        inList1.add(gen1Iter);
        inList1.add(const1Iter);
        inList2.clear();
        inList2.add(gen2Iter);
        inList2.add(const2Iter);
        input1 = new MergeIterator<>(inList1, comparator1.duplicate());
        input2 = new MergeIterator<>(inList2, comparator2.duplicate());
        StreamOperator operator = getOperator(outerJoinType);
        RandomSortMergeInnerJoinTest.match(expectedMatchesMap, RandomSortMergeInnerJoinTest.transformToBinary(myJoin(operator, input1, input2)));
        // assert that each expected match was seen
        for (Entry<Integer, Collection<Match>> entry : expectedMatchesMap.entrySet()) {
            if (!entry.getValue().isEmpty()) {
                Assert.fail("Collection for key " + entry.getKey() + " is not empty");
            }
        }
    } catch (Exception e) {
        // Fail with the original exception attached as the cause so its stack trace shows up
        // in the test report instead of only on stderr.
        throw new AssertionError("An exception occurred during the test: " + e.getMessage(), e);
    }
}
Also used : MutableObjectIterator(org.apache.flink.util.MutableObjectIterator) ArrayList(java.util.ArrayList) IntComparator(org.apache.flink.api.common.typeutils.base.IntComparator) TupleGenerator(org.apache.flink.runtime.operators.testutils.TestData.TupleGenerator) TupleComparator(org.apache.flink.api.java.typeutils.runtime.TupleComparator) TupleGeneratorIterator(org.apache.flink.runtime.operators.testutils.TestData.TupleGeneratorIterator) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Collection(java.util.Collection) StreamOperator(org.apache.flink.streaming.api.operators.StreamOperator) TupleConstantValueIterator(org.apache.flink.runtime.operators.testutils.TestData.TupleConstantValueIterator) MergeIterator(org.apache.flink.runtime.operators.sort.MergeIterator)

Example 3 with TupleComparator

Use of org.apache.flink.api.java.typeutils.runtime.TupleComparator in the Apache Flink project.

In class AbstractSortMergeOuterJoinIteratorITCase, method testOuterJoinWithHighNumberOfCommonKeys:

/**
 * Runs a sort-merge outer join iterator over two inputs that each merge a generated tuple
 * stream with a stream of constant-key duplicates (key 13), and checks that every expected
 * match is produced.
 *
 * <p>The expected matches are computed with {@code joinValues} from a first pass over the
 * inputs; the generators and iterators are then reset and the inputs rebuilt for the actual
 * join. The {@code MatchRemovingJoiner} is constructed over the expected-match map, and the
 * test fails if any expected match collection is still non-empty after the join.
 *
 * @param outerJoinType join type handed to {@code joinValues} and {@code createOuterJoinIterator}
 * @param input1Size element count handed to the first {@code TupleGeneratorIterator}
 * @param input1Duplicates count handed to the first {@code TupleConstantValueIterator}
 * @param input1ValueLength value length handed to the first {@code TupleGenerator}
 * @param input1KeyDensity key density handed to the first {@code TupleGenerator}
 * @param input2Size element count handed to the second {@code TupleGeneratorIterator}
 * @param input2Duplicates count handed to the second {@code TupleConstantValueIterator}
 * @param input2ValueLength value length handed to the second {@code TupleGenerator}
 * @param input2KeyDensity key density handed to the second {@code TupleGenerator}
 */
// Each warning name must be its own array element; a single "unchecked, rawtypes" string is
// not a recognized warning name.
@SuppressWarnings({"unchecked", "rawtypes"})
protected void testOuterJoinWithHighNumberOfCommonKeys(OuterJoinType outerJoinType, int input1Size, int input1Duplicates, int input1ValueLength, float input1KeyDensity, int input2Size, int input2Duplicates, int input2ValueLength, float input2KeyDensity) {
    TypeSerializer<Tuple2<Integer, String>> serializer1 = new TupleSerializer<>((Class<Tuple2<Integer, String>>) (Class<?>) Tuple2.class, new TypeSerializer<?>[] { IntSerializer.INSTANCE, StringSerializer.INSTANCE });
    TypeSerializer<Tuple2<Integer, String>> serializer2 = new TupleSerializer<>((Class<Tuple2<Integer, String>>) (Class<?>) Tuple2.class, new TypeSerializer<?>[] { IntSerializer.INSTANCE, StringSerializer.INSTANCE });
    TypeComparator<Tuple2<Integer, String>> comparator1 = new TupleComparator<>(new int[] { 0 }, new TypeComparator<?>[] { new IntComparator(true) }, new TypeSerializer<?>[] { IntSerializer.INSTANCE });
    TypeComparator<Tuple2<Integer, String>> comparator2 = new TupleComparator<>(new int[] { 0 }, new TypeComparator<?>[] { new IntComparator(true) }, new TypeSerializer<?>[] { IntSerializer.INSTANCE });
    TypePairComparator<Tuple2<Integer, String>, Tuple2<Integer, String>> pairComparator = new GenericPairComparator<>(comparator1, comparator2);
    final int duplicateKey = 13;
    try {
        final TupleGenerator generator1 = new TupleGenerator(SEED1, 500, input1KeyDensity, input1ValueLength, KeyMode.SORTED_SPARSE, ValueMode.RANDOM_LENGTH, null);
        final TupleGenerator generator2 = new TupleGenerator(SEED2, 500, input2KeyDensity, input2ValueLength, KeyMode.SORTED_SPARSE, ValueMode.RANDOM_LENGTH, null);
        final TupleGeneratorIterator gen1Iter = new TupleGeneratorIterator(generator1, input1Size);
        final TupleGeneratorIterator gen2Iter = new TupleGeneratorIterator(generator2, input2Size);
        final TupleConstantValueIterator const1Iter = new TupleConstantValueIterator(duplicateKey, "LEFT String for Duplicate Keys", input1Duplicates);
        final TupleConstantValueIterator const2Iter = new TupleConstantValueIterator(duplicateKey, "RIGHT String for Duplicate Keys", input2Duplicates);
        final List<MutableObjectIterator<Tuple2<Integer, String>>> inList1 = new ArrayList<>();
        inList1.add(gen1Iter);
        inList1.add(const1Iter);
        final List<MutableObjectIterator<Tuple2<Integer, String>>> inList2 = new ArrayList<>();
        inList2.add(gen2Iter);
        inList2.add(const2Iter);
        MutableObjectIterator<Tuple2<Integer, String>> input1 = new MergeIterator<>(inList1, comparator1.duplicate());
        MutableObjectIterator<Tuple2<Integer, String>> input2 = new MergeIterator<>(inList2, comparator2.duplicate());
        // collect expected data
        final Map<Integer, Collection<Match>> expectedMatchesMap = joinValues(collectData(input1), collectData(input2), outerJoinType);
        // re-create the whole thing for actual processing
        // reset the generators and iterators
        generator1.reset();
        generator2.reset();
        const1Iter.reset();
        const2Iter.reset();
        gen1Iter.reset();
        gen2Iter.reset();
        inList1.clear();
        inList1.add(gen1Iter);
        inList1.add(const1Iter);
        inList2.clear();
        inList2.add(gen2Iter);
        inList2.add(const2Iter);
        input1 = new MergeIterator<>(inList1, comparator1.duplicate());
        input2 = new MergeIterator<>(inList2, comparator2.duplicate());
        final FlatJoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple2<Integer, String>> joinFunction = new MatchRemovingJoiner(expectedMatchesMap);
        final Collector<Tuple2<Integer, String>> collector = new DiscardingOutputCollector<>();
        // we create this sort-merge iterator with little memory for the block-nested-loops
        // fall-back to make sure it
        // needs to spill for the duplicate keys
        AbstractMergeOuterJoinIterator<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple2<Integer, String>> iterator = createOuterJoinIterator(outerJoinType, input1, input2, serializer1, comparator1, serializer2, comparator2, pairComparator, this.memoryManager, this.ioManager, PAGES_FOR_BNLJN, this.parentTask);
        iterator.open();
        try {
            while (iterator.callWithNextKey(joinFunction, collector)) {
                // all work happens inside callWithNextKey; loop until it reports no more keys
            }
        } finally {
            // close the iterator even when the join throws
            iterator.close();
        }
        // assert that each expected match was seen
        for (Entry<Integer, Collection<Match>> entry : expectedMatchesMap.entrySet()) {
            if (!entry.getValue().isEmpty()) {
                Assert.fail("Collection for key " + entry.getKey() + " is not empty");
            }
        }
    } catch (Exception e) {
        // Fail with the original exception attached as the cause so its stack trace shows up
        // in the test report instead of only on stderr.
        throw new AssertionError("An exception occurred during the test: " + e.getMessage(), e);
    }
}
Also used : MutableObjectIterator(org.apache.flink.util.MutableObjectIterator) ResettableMutableObjectIterator(org.apache.flink.runtime.util.ResettableMutableObjectIterator) ArrayList(java.util.ArrayList) IntComparator(org.apache.flink.api.common.typeutils.base.IntComparator) TupleComparator(org.apache.flink.api.java.typeutils.runtime.TupleComparator) MatchRemovingJoiner(org.apache.flink.runtime.operators.testutils.MatchRemovingJoiner) TupleSerializer(org.apache.flink.api.java.typeutils.runtime.TupleSerializer) GenericPairComparator(org.apache.flink.api.common.typeutils.GenericPairComparator) TupleConstantValueIterator(org.apache.flink.runtime.operators.testutils.TestData.TupleConstantValueIterator) TupleGenerator(org.apache.flink.runtime.operators.testutils.TestData.TupleGenerator) TupleGeneratorIterator(org.apache.flink.runtime.operators.testutils.TestData.TupleGeneratorIterator) DiscardingOutputCollector(org.apache.flink.runtime.operators.testutils.DiscardingOutputCollector) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Collection(java.util.Collection)

Example 4 with TupleComparator

Use of org.apache.flink.api.java.typeutils.runtime.TupleComparator in the Apache Flink project.

In class HashTableRecordWidthCombinations, method main:

/**
 * Exercises {@code MutableHashTable} over a grid of build-side record counts (3400..3549) and
 * record lengths (270..319), joining a build side whose records all carry key 42 against a
 * probe side producing the values 0..9999.
 *
 * <p>A {@code RuntimeException} whose message contains "exceeded maximum number of recursions"
 * is tolerated for a combination; any other failure aborts the run. After every combination
 * {@code checkNoTempFilesRemain} is invoked on the I/O manager.
 *
 * @param args unused
 * @throws Exception declared by the signature; failures inside the run are rethrown as
 *     {@link AssertionError} with the original exception as cause
 */
public static void main(String[] args) throws Exception {
    @SuppressWarnings("unchecked") final TypeSerializer<Tuple2<Long, byte[]>> buildSerializer = new TupleSerializer<Tuple2<Long, byte[]>>((Class<Tuple2<Long, byte[]>>) (Class<?>) Tuple2.class, new TypeSerializer<?>[] { LongSerializer.INSTANCE, BytePrimitiveArraySerializer.INSTANCE });
    final TypeSerializer<Long> probeSerializer = LongSerializer.INSTANCE;
    final TypeComparator<Tuple2<Long, byte[]>> buildComparator = new TupleComparator<Tuple2<Long, byte[]>>(new int[] { 0 }, new TypeComparator<?>[] { new LongComparator(true) }, new TypeSerializer<?>[] { LongSerializer.INSTANCE });
    final TypeComparator<Long> probeComparator = new LongComparator(true);
    // compares a probe-side Long reference against field f0 of a build-side tuple
    final TypePairComparator<Long, Tuple2<Long, byte[]>> pairComparator = new TypePairComparator<Long, Tuple2<Long, byte[]>>() {

        private long ref;

        @Override
        public void setReference(Long reference) {
            ref = reference;
        }

        @Override
        public boolean equalToReference(Tuple2<Long, byte[]> candidate) {
            // noinspection UnnecessaryUnboxing
            return candidate.f0.longValue() == ref;
        }

        @Override
        public int compareToReference(Tuple2<Long, byte[]> candidate) {
            long x = ref;
            long y = candidate.f0;
            return (x < y) ? -1 : ((x == y) ? 0 : 1);
        }
    };
    try (final IOManager ioMan = new IOManagerAsync()) {
        final int pageSize = 32 * 1024;
        final int numSegments = 34;
        for (int num = 3400; num < 3550; num++) {
            final int numRecords = num;
            for (int recordLen = 270; recordLen < 320; recordLen++) {
                final byte[] payload = new byte[recordLen - 8 - 4];
                System.out.println("testing " + numRecords + " / " + recordLen);
                List<MemorySegment> memory = getMemory(numSegments, pageSize);
                // we create a hash table that thinks the records are super large. that makes it
                // choose initially
                // a lot of memory for the partition buffers, and start with a smaller hash
                // table. that way
                // we trigger a hash table growth early.
                MutableHashTable<Tuple2<Long, byte[]>, Long> table = new MutableHashTable<>(buildSerializer, probeSerializer, buildComparator, probeComparator, pairComparator, memory, ioMan, 16, false);
                // build side: numRecords tuples, all with key 42 and the shared payload array
                final MutableObjectIterator<Tuple2<Long, byte[]>> buildInput = new MutableObjectIterator<Tuple2<Long, byte[]>>() {

                    private int count = 0;

                    @Override
                    public Tuple2<Long, byte[]> next(Tuple2<Long, byte[]> reuse) {
                        return next();
                    }

                    @Override
                    public Tuple2<Long, byte[]> next() {
                        if (count++ < numRecords) {
                            return new Tuple2<>(42L, payload);
                        } else {
                            return null;
                        }
                    }
                };
                // probe side
                final MutableObjectIterator<Long> probeInput = new MutableObjectIterator<Long>() {

                    // named distinctly so it does not shadow the build-side numRecords local
                    private final long probeRecordCount = 10000;

                    private long value = 0;

                    @Override
                    public Long next(Long aLong) {
                        return next();
                    }

                    @Override
                    public Long next() {
                        if (value < probeRecordCount) {
                            return value++;
                        } else {
                            return null;
                        }
                    }
                };
                table.open(buildInput, probeInput);
                try {
                    while (table.nextRecord()) {
                        MutableObjectIterator<Tuple2<Long, byte[]>> matches = table.getBuildSideIterator();
                        while (matches.next() != null) {
                            // drain the matches; only successful iteration matters here
                        }
                    }
                } catch (RuntimeException e) {
                    // Only the recursion-limit failure is tolerated. An exception without a
                    // message is rethrown as-is instead of triggering a NullPointerException.
                    final String message = e.getMessage();
                    if (message == null || !message.contains("exceeded maximum number of recursions")) {
                        throw e;
                    }
                } finally {
                    table.close();
                }
                // make sure no temp files are left
                checkNoTempFilesRemain(ioMan);
            }
        }
    } catch (Exception e) {
        // Fail with the original exception attached as the cause so its stack trace is kept.
        throw new AssertionError(e.getMessage(), e);
    }
}
Also used : MutableObjectIterator(org.apache.flink.util.MutableObjectIterator) TupleComparator(org.apache.flink.api.java.typeutils.runtime.TupleComparator) TupleSerializer(org.apache.flink.api.java.typeutils.runtime.TupleSerializer) IOManagerAsync(org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) TypePairComparator(org.apache.flink.api.common.typeutils.TypePairComparator) LongComparator(org.apache.flink.api.common.typeutils.base.LongComparator) MemorySegment(org.apache.flink.core.memory.MemorySegment) Tuple2(org.apache.flink.api.java.tuple.Tuple2) MutableHashTable(org.apache.flink.runtime.operators.hash.MutableHashTable)

Aggregations

TupleComparator (org.apache.flink.api.java.typeutils.runtime.TupleComparator)4 ArrayList (java.util.ArrayList)3 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)3 TupleSerializer (org.apache.flink.api.java.typeutils.runtime.TupleSerializer)3 MutableObjectIterator (org.apache.flink.util.MutableObjectIterator)3 Collection (java.util.Collection)2 IntComparator (org.apache.flink.api.common.typeutils.base.IntComparator)2 MemorySegment (org.apache.flink.core.memory.MemorySegment)2 TupleConstantValueIterator (org.apache.flink.runtime.operators.testutils.TestData.TupleConstantValueIterator)2 TupleGenerator (org.apache.flink.runtime.operators.testutils.TestData.TupleGenerator)2 TupleGeneratorIterator (org.apache.flink.runtime.operators.testutils.TestData.TupleGeneratorIterator)2 GenericPairComparator (org.apache.flink.api.common.typeutils.GenericPairComparator)1 TypePairComparator (org.apache.flink.api.common.typeutils.TypePairComparator)1 LongComparator (org.apache.flink.api.common.typeutils.base.LongComparator)1 Tuple (org.apache.flink.api.java.tuple.Tuple)1 FileChannelOutputView (org.apache.flink.runtime.io.disk.FileChannelOutputView)1 IOManager (org.apache.flink.runtime.io.disk.iomanager.IOManager)1 IOManagerAsync (org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync)1 MutableHashTable (org.apache.flink.runtime.operators.hash.MutableHashTable)1 MergeIterator (org.apache.flink.runtime.operators.sort.MergeIterator)1