Search in sources :

Example 1 with SparkRowIterator

Use of org.apache.cassandra.spark.sparksql.SparkRowIterator in the project spark-cassandra-bulkreader by jberragan.

The class SSTableReaderTests defines the method testSkipPartitionsCompactionScanner.

@Test
public void testSkipPartitionsCompactionScanner() {
    runTest((partitioner, dir, bridge) -> {
        // Write a single SSTable with NUM_ROWS partitions x NUM_COLS rows,
        // where the value column is the sum of the two key columns (i + j).
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < NUM_ROWS; i++) {
                for (int j = 0; j < NUM_COLS; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));
        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final TableMetadata metaData = schema.schemaBuilder(partitioner).tableMetaData();
        // Data layer stubbed so that SparkRowIterator reads exactly the reader we open below.
        final Set<SparkSSTableReader> readers = new HashSet<>(1);
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile), schema.buildSchema()) {

            @Override
            public SSTablesSupplier sstables(final List<CustomFilter> filters) {
                return new SSTablesSupplier() {

                    @Override
                    @SuppressWarnings("unchecked") // 'readers' only ever holds SparkSSTableReader instances
                    public <T extends SparkSSTableReader> Set<T> openAll(ReaderOpener<T> readerOpener) {
                        return (Set<T>) readers;
                    }
                };
            }
        };
        // Restrict the Spark token range to a sub-range of the ring so that some of the
        // partitions written above fall outside it and must be skipped by the scanner.
        final Range<BigInteger> sparkTokenRange;
        switch (partitioner) {
            case Murmur3Partitioner:
                sparkTokenRange = Range.closed(BigInteger.valueOf(-9223372036854775808L), BigInteger.valueOf(3074457345618258602L));
                break;
            case RandomPartitioner:
                sparkTokenRange = Range.closed(BigInteger.ZERO, new BigInteger("916176208424801638531839357843455255"));
                break;
            default:
                throw new RuntimeException("Unexpected partitioner: " + partitioner);
        }
        final SparkRangeFilter rangeFilter = SparkRangeFilter.create(sparkTokenRange);
        final AtomicBoolean pass = new AtomicBoolean(true);
        final AtomicInteger skipCount = new AtomicInteger(0);
        // Count every skipped partition; flag a failure if an in-range partition is skipped.
        final Stats stats = new Stats() {

            @Override
            public void skippedPartition(ByteBuffer key, BigInteger token) {
                LOGGER.info("Skipping partition: " + token);
                skipCount.incrementAndGet();
                if (sparkTokenRange.contains(token)) {
                    LOGGER.info("Should not skip partition: " + token);
                    pass.set(false);
                }
            }
        };
        final FourZeroSSTableReader reader = openReader(metaData, dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")), Collections.singletonList(rangeFilter), false, stats);
        readers.add(reader);
        // read the SSTable end-to-end using SparkRowIterator and verify it skips the required partitions
        // and all the partitions returned are within the Spark token range.
        final SparkRowIterator it = new SparkRowIterator(dataLayer);
        int count = 0;
        while (it.next()) {
            final InternalRow row = it.get();
            // Value column equals the sum of the two key columns (expected value first, per JUnit convention).
            assertEquals(row.getInt(0) + row.getInt(1), row.getInt(2));
            final DecoratedKey key = FourZero.getPartitioner(partitioner).decorateKey((ByteBuffer) ByteBuffer.allocate(4).putInt(row.getInt(0)).flip());
            final BigInteger token = FourZeroUtils.tokenToBigInteger(key.getToken());
            assertTrue(sparkTokenRange.contains(token));
            count++;
        }
        // At least one partition must have fallen outside the Spark token range.
        assertTrue(skipCount.get() > 0);
        // should skip out of range partitions here; each surviving partition contributes NUM_COLS rows
        assertEquals((NUM_ROWS - skipCount.get()) * NUM_COLS, count);
        assertTrue(pass.get());
    });
}
Also used : SparkRowIterator(org.apache.cassandra.spark.sparksql.SparkRowIterator) Set(java.util.Set) HashSet(java.util.HashSet) SSTablesSupplier(org.apache.cassandra.spark.data.SSTablesSupplier) List(java.util.List) ArrayList(java.util.ArrayList) SparkRangeFilter(org.apache.cassandra.spark.sparksql.filters.SparkRangeFilter) InternalRow(org.apache.spark.sql.catalyst.InternalRow) HashSet(java.util.HashSet) Path(java.nio.file.Path) TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) DecoratedKey(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.DecoratedKey) BufferDecoratedKey(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.BufferDecoratedKey) TestSchema(org.apache.cassandra.spark.TestSchema) ByteBuffer(java.nio.ByteBuffer) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) TestDataLayer(org.apache.cassandra.spark.TestDataLayer) Stats(org.apache.cassandra.spark.stats.Stats) BigInteger(java.math.BigInteger) SparkSSTableReader(org.apache.cassandra.spark.reader.SparkSSTableReader) TestUtils.runTest(org.apache.cassandra.spark.TestUtils.runTest) Test(org.junit.Test)

Aggregations

BigInteger (java.math.BigInteger)1 ByteBuffer (java.nio.ByteBuffer)1 Path (java.nio.file.Path)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 List (java.util.List)1 Set (java.util.Set)1 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 TestDataLayer (org.apache.cassandra.spark.TestDataLayer)1 TestSchema (org.apache.cassandra.spark.TestSchema)1 TestUtils.runTest (org.apache.cassandra.spark.TestUtils.runTest)1 SSTablesSupplier (org.apache.cassandra.spark.data.SSTablesSupplier)1 SparkSSTableReader (org.apache.cassandra.spark.reader.SparkSSTableReader)1 BufferDecoratedKey (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.BufferDecoratedKey)1 DecoratedKey (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.DecoratedKey)1 TableMetadata (org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata)1 SparkRowIterator (org.apache.cassandra.spark.sparksql.SparkRowIterator)1 SparkRangeFilter (org.apache.cassandra.spark.sparksql.filters.SparkRangeFilter)1 Stats (org.apache.cassandra.spark.stats.Stats)1