Example 1 with TestDataLayer

Use of org.apache.cassandra.spark.TestDataLayer in project spark-cassandra-bulkreader by jberragan.

From class SSTableReaderTests, method testSkipNoPartitions.

@Test
public void testSkipNoPartitions() {
    runTest((partitioner, dir, bridge) -> {
        // write an SSTable
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < NUM_ROWS; i++) {
                for (int j = 0; j < NUM_COLS; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));
        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final Path summaryFile = getFirstFileType(dir, DataLayer.FileType.SUMMARY);
        final TableMetadata metaData = schema.schemaBuilder(partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));
        final SummaryDbUtils.Summary summary;
        try (final InputStream in = new BufferedInputStream(Files.newInputStream(summaryFile))) {
            summary = SummaryDbUtils.readSummary(in, metaData.partitioner, metaData.params.minIndexInterval, metaData.params.maxIndexInterval);
        }
        // set Spark token range equal to SSTable token range
        final Range<BigInteger> sparkTokenRange = Range.closed(FourZeroUtils.tokenToBigInteger(summary.first().getToken()), FourZeroUtils.tokenToBigInteger(summary.last().getToken()));
        final SparkRangeFilter rangeFilter = SparkRangeFilter.create(sparkTokenRange);
        final AtomicBoolean skipped = new AtomicBoolean(false);
        final Stats stats = new Stats() {

            @Override
            public void skippedPartition(ByteBuffer key, BigInteger token) {
                LOGGER.error("Skipped partition when it should not have been skipped: " + token);
                skipped.set(true);
            }
        };
        final FourZeroSSTableReader reader = openReader(metaData, dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")), Collections.singletonList(rangeFilter), true, stats);
        // shouldn't skip any partitions here
        assertEquals(NUM_ROWS * NUM_COLS, countAndValidateRows(reader));
        assertFalse(skipped.get());
    });
}
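The key invariant in this test: the Spark token range is built from the SSTable's own Summary.db first and last tokens, so every partition in the file falls inside the range and none may be skipped. Below is a minimal sketch of that containment check, assuming only Guava's Range; shouldSkip is a hypothetical helper for illustration, not part of the project's API.

import com.google.common.collect.Range;
import java.math.BigInteger;

public class TokenRangeSketch {
    // A partition is skipped only when its token falls outside the Spark token range.
    static boolean shouldSkip(final Range<BigInteger> sparkTokenRange, final BigInteger token) {
        return !sparkTokenRange.contains(token);
    }

    public static void main(final String[] args) {
        // A range covering the SSTable's [first, last] tokens keeps every token in between.
        final Range<BigInteger> range = Range.closed(BigInteger.valueOf(-100), BigInteger.valueOf(100));
        System.out.println(shouldSkip(range, BigInteger.ZERO));          // false: inside the range
        System.out.println(shouldSkip(range, BigInteger.valueOf(101)));  // true: outside the range
    }
}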
Also used: Path (java.nio.file.Path), TableMetadata (org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata), BufferedInputStream (java.io.BufferedInputStream), DataInputStream (java.io.DataInputStream), FileInputStream (java.io.FileInputStream), InputStream (java.io.InputStream), TestSchema (org.apache.cassandra.spark.TestSchema), ByteBuffer (java.nio.ByteBuffer), AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean), TestDataLayer (org.apache.cassandra.spark.TestDataLayer), Stats (org.apache.cassandra.spark.stats.Stats), BigInteger (java.math.BigInteger), SparkRangeFilter (org.apache.cassandra.spark.sparksql.filters.SparkRangeFilter), TestUtils.runTest (org.apache.cassandra.spark.TestUtils.runTest), Test (org.junit.Test)

Example 2 with TestDataLayer

Use of org.apache.cassandra.spark.TestDataLayer in project spark-cassandra-bulkreader by jberragan.

From class SSTableReaderTests, method testPartialFilterMatch.

@Test
public void testPartialFilterMatch() {
    runTest((partitioner, dir, bridge) -> {
        // write an SSTable
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < NUM_ROWS; i++) {
                for (int j = 0; j < NUM_COLS; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));
        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final TableMetadata metaData = new FourZeroSchemaBuilder(schema.createStmt, schema.keyspace, new ReplicationFactor(ReplicationFactor.ReplicationStrategy.SimpleStrategy, ImmutableMap.of("replication_factor", 1)), partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));
        final ByteBuffer key1 = Int32Type.instance.fromString("0");
        final BigInteger token1 = bridge.hash(partitioner, key1);
        final PartitionKeyFilter keyInSSTable = PartitionKeyFilter.create(key1, token1);
        final SparkRangeFilter rangeFilter = SparkRangeFilter.create(Range.closed(token1, token1));
        final ByteBuffer key2 = Int32Type.instance.fromString("55");
        final BigInteger token2 = bridge.hash(partitioner, key2);
        final PartitionKeyFilter keyNotInSSTable = PartitionKeyFilter.create(key2, token2);
        final List<CustomFilter> filters = Arrays.asList(rangeFilter, keyInSSTable, keyNotInSSTable);
        final AtomicBoolean pass = new AtomicBoolean(true);
        final AtomicInteger skipCount = new AtomicInteger(0);
        final Stats stats = new Stats() {

            @Override
            public void skippedPartition(ByteBuffer key, BigInteger token) {
                LOGGER.info("Skipping partition: " + token);
                skipCount.incrementAndGet();
                if (filters.stream().anyMatch(filter -> !filter.skipPartition(key, token))) {
                    LOGGER.info("Should not skip partition: " + token);
                    pass.set(false);
                }
            }
        };
        final FourZeroSSTableReader reader = openReader(metaData, dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")), filters, false, stats);
        final int rows = countAndValidateRows(reader);
        assertTrue(skipCount.get() > 0);
        assertEquals(NUM_COLS, rows);
        // should skip partitions not matching filters
        assertEquals((NUM_ROWS - skipCount.get()) * NUM_COLS, rows);
        assertTrue(pass.get());
    });
}
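The three filters interact as follows: the reader may skip a partition only when every filter agrees to skip it, which is what the anyMatch(filter -> !filter.skipPartition(key, token)) check in the Stats callback verifies from the other direction. Because the PartitionKeyFilter for key "0" matches exactly one partition, only that partition survives and rows == NUM_COLS. Here is a sketch of that combination rule using nothing beyond the JDK; the BiPredicate stand-in is hypothetical and not the project's CustomFilter interface.

import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.List;
import java.util.function.BiPredicate;

public class FilterCombinationSketch {
    // Skip a partition only when every filter votes to skip it;
    // if any single filter still matches the partition, it must be kept.
    static boolean skipPartition(final List<BiPredicate<ByteBuffer, BigInteger>> skipChecks,
                                 final ByteBuffer key, final BigInteger token) {
        return skipChecks.stream().allMatch(check -> check.test(key, token));
    }

    public static void main(final String[] args) {
        final ByteBuffer key = (ByteBuffer) ByteBuffer.allocate(4).putInt(0).flip();
        final BigInteger token = BigInteger.TEN;  // placeholder token for the demo
        final boolean skip = skipPartition(
                Arrays.asList((k, t) -> false,   // e.g. a matching PartitionKeyFilter: do not skip
                              (k, t) -> true),   // e.g. a non-matching one: skip
                key, token);
        System.out.println(skip);  // false: kept, because one filter still matches
    }
}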
Also used: Path (java.nio.file.Path), TableMetadata (org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata), ReplicationFactor (org.apache.cassandra.spark.data.ReplicationFactor), TestSchema (org.apache.cassandra.spark.TestSchema), ByteBuffer (java.nio.ByteBuffer), PartitionKeyFilter (org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter), AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean), CustomFilter (org.apache.cassandra.spark.sparksql.filters.CustomFilter), AtomicInteger (java.util.concurrent.atomic.AtomicInteger), TestDataLayer (org.apache.cassandra.spark.TestDataLayer), Stats (org.apache.cassandra.spark.stats.Stats), BigInteger (java.math.BigInteger), SparkRangeFilter (org.apache.cassandra.spark.sparksql.filters.SparkRangeFilter), TestUtils.runTest (org.apache.cassandra.spark.TestUtils.runTest), Test (org.junit.Test)

Example 3 with TestDataLayer

Use of org.apache.cassandra.spark.TestDataLayer in project spark-cassandra-bulkreader by jberragan.

From class SSTableReaderTests, method testOpenSSTableReader.

@Test
public void testOpenSSTableReader() {
    runTest((partitioner, dir, bridge) -> {
        // write an SSTable
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < NUM_ROWS; i++) {
                for (int j = 0; j < NUM_COLS; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));
        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final TableMetadata metaData = schema.schemaBuilder(partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));
        final FourZeroSSTableReader reader = openReader(metaData, dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")));
        assertNotNull(reader.firstToken());
        assertNotNull(reader.lastToken());
        assertNotNull(reader.getSSTableMetadata());
        assertFalse(reader.isRepaired());
        assertEquals(NUM_ROWS * NUM_COLS, countAndValidateRows(reader));
    });
}
Also used: Path (java.nio.file.Path), TableMetadata (org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata), TestDataLayer (org.apache.cassandra.spark.TestDataLayer), TestSchema (org.apache.cassandra.spark.TestSchema), TestUtils.runTest (org.apache.cassandra.spark.TestUtils.runTest), Test (org.junit.Test)

Example 4 with TestDataLayer

Use of org.apache.cassandra.spark.TestDataLayer in project spark-cassandra-bulkreader by jberragan.

From class SSTableReaderTests, method testSSTableRange.

@Test
public void testSSTableRange() {
    runTest((partitioner, dir, bridge) -> {
        // write an SSTable
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < 10; i++) {
                for (int j = 0; j < 1; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));
        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final TableMetadata metaData = schema.schemaBuilder(partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));
        final SparkSSTableReader reader = openReader(metaData, dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")));
        assertNotNull(reader.firstToken());
        assertNotNull(reader.lastToken());
        // verify primary Index.db file matches first and last
        final Path indexFile = getFirstFileType(dir, DataLayer.FileType.INDEX);
        final Pair<DecoratedKey, DecoratedKey> firstAndLast;
        try (final InputStream is = new BufferedInputStream(new FileInputStream(indexFile.toFile()))) {
            final Pair<ByteBuffer, ByteBuffer> keys = FourZeroUtils.readPrimaryIndex(is, true, Collections.emptyList());
            firstAndLast = Pair.of(FourZero.getPartitioner(partitioner).decorateKey(keys.getLeft()), FourZero.getPartitioner(partitioner).decorateKey(keys.getRight()));
        }
        final BigInteger first = FourZeroUtils.tokenToBigInteger(firstAndLast.getLeft().getToken());
        final BigInteger last = FourZeroUtils.tokenToBigInteger(firstAndLast.getRight().getToken());
        assertEquals(first, reader.firstToken());
        assertEquals(last, reader.lastToken());
        switch(partitioner) {
            case Murmur3Partitioner:
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(Partitioner.Murmur3Partitioner.minToken(), Partitioner.Murmur3Partitioner.minToken())));
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(-8710962479251732708L), BigInteger.valueOf(-7686143364045646507L))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(-7509452495886106294L), BigInteger.valueOf(-7509452495886106293L))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(-7509452495886106293L), BigInteger.valueOf(-7509452495886106293L))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(-7509452495886106293L), BigInteger.valueOf(2562047788015215502L))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(-7509452495886106293L), BigInteger.valueOf(9010454139840013625L))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(9010454139840013625L), BigInteger.valueOf(9010454139840013625L))));
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(Partitioner.Murmur3Partitioner.maxToken(), Partitioner.Murmur3Partitioner.maxToken())));
                return;
            case RandomPartitioner:
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(Partitioner.RandomPartitioner.minToken(), Partitioner.RandomPartitioner.minToken())));
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(0L), BigInteger.valueOf(500L))));
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(new BigInteger("18837662806270881894834867523173387677"), new BigInteger("18837662806270881894834867523173387677"))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(new BigInteger("18837662806270881894834867523173387678"), new BigInteger("18837662806270881894834867523173387678"))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(new BigInteger("18837662806270881894834867523173387679"), new BigInteger("18837662806270881894834867523173387679"))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(new BigInteger("18837662806270881894834867523173387679"), new BigInteger("137731376325982006772573399291321493164"))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(new BigInteger("137731376325982006772573399291321493164"), new BigInteger("137731376325982006772573399291321493164"))));
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(new BigInteger("137731376325982006772573399291321493165"), new BigInteger("137731376325982006772573399291321493165"))));
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(Partitioner.RandomPartitioner.maxToken(), Partitioner.RandomPartitioner.maxToken())));
                return;
            default:
                throw new RuntimeException("Unexpected partitioner: " + partitioner);
        }
    });
}
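The overlap assertions above reduce to interval arithmetic on closed token ranges: two ranges overlap iff they share at least one token, i.e. first <= other.upper and other.lower <= last. A sketch of that check with Guava follows; it is a stand-in for SparkSSTableReader.overlaps, whose actual implementation is not shown on this page.

import com.google.common.collect.Range;
import java.math.BigInteger;

public class OverlapSketch {
    // Closed ranges overlap iff they are connected; for closed bounds the
    // intersection of connected ranges always contains at least one point.
    static boolean overlaps(final Range<BigInteger> sstableRange, final Range<BigInteger> sparkRange) {
        return sstableRange.isConnected(sparkRange) && !sstableRange.intersection(sparkRange).isEmpty();
    }

    public static void main(final String[] args) {
        // Bounds taken from the Murmur3 case above: the SSTable spans [-7509452495886106293, 9010454139840013625].
        final Range<BigInteger> sstable = Range.closed(new BigInteger("-7509452495886106293"), new BigInteger("9010454139840013625"));
        System.out.println(overlaps(sstable, Range.closed(new BigInteger("-8710962479251732708"), new BigInteger("-7686143364045646507"))));  // false: entirely below the SSTable range
        System.out.println(overlaps(sstable, Range.closed(new BigInteger("9010454139840013625"), new BigInteger("9010454139840013625"))));    // true: touches the last token
    }
}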
Also used: Path (java.nio.file.Path), TableMetadata (org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata), BufferedInputStream (java.io.BufferedInputStream), DataInputStream (java.io.DataInputStream), FileInputStream (java.io.FileInputStream), InputStream (java.io.InputStream), DecoratedKey (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.DecoratedKey), BufferDecoratedKey (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.BufferDecoratedKey), TestSchema (org.apache.cassandra.spark.TestSchema), ByteBuffer (java.nio.ByteBuffer), TestDataLayer (org.apache.cassandra.spark.TestDataLayer), BigInteger (java.math.BigInteger), SparkSSTableReader (org.apache.cassandra.spark.reader.SparkSSTableReader), TestUtils.runTest (org.apache.cassandra.spark.TestUtils.runTest), Test (org.junit.Test)

Example 5 with TestDataLayer

Use of org.apache.cassandra.spark.TestDataLayer in project spark-cassandra-bulkreader by jberragan.

From class SSTableReaderTests, method testSkipPartitionsCompactionScanner.

@Test
public void testSkipPartitionsCompactionScanner() {
    runTest((partitioner, dir, bridge) -> {
        // write an SSTable
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < NUM_ROWS; i++) {
                for (int j = 0; j < NUM_COLS; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));
        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final TableMetadata metaData = schema.schemaBuilder(partitioner).tableMetaData();
        final Set<SparkSSTableReader> readers = new HashSet<>(1);
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile), schema.buildSchema()) {

            @Override
            public SSTablesSupplier sstables(final List<CustomFilter> filters) {
                return new SSTablesSupplier() {

                    @Override
                    @SuppressWarnings("unchecked")
                    public <T extends SparkSSTableReader> Set<T> openAll(final ReaderOpener<T> readerOpener) {
                        return (Set<T>) readers;
                    }
                };
            }
        };
        final Range<BigInteger> sparkTokenRange;
        switch(partitioner) {
            case Murmur3Partitioner:
                sparkTokenRange = Range.closed(BigInteger.valueOf(-9223372036854775808L), BigInteger.valueOf(3074457345618258602L));
                break;
            case RandomPartitioner:
                sparkTokenRange = Range.closed(BigInteger.ZERO, new BigInteger("916176208424801638531839357843455255"));
                break;
            default:
                throw new RuntimeException("Unexpected partitioner: " + partitioner);
        }
        final SparkRangeFilter rangeFilter = SparkRangeFilter.create(sparkTokenRange);
        final AtomicBoolean pass = new AtomicBoolean(true);
        final AtomicInteger skipCount = new AtomicInteger(0);
        final Stats stats = new Stats() {

            @Override
            public void skippedPartition(ByteBuffer key, BigInteger token) {
                LOGGER.info("Skipping partition: " + token);
                skipCount.incrementAndGet();
                if (sparkTokenRange.contains(token)) {
                    LOGGER.info("Should not skip partition: " + token);
                    pass.set(false);
                }
            }
        };
        final FourZeroSSTableReader reader = openReader(metaData, dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")), Collections.singletonList(rangeFilter), false, stats);
        readers.add(reader);
        // read the SSTable end-to-end using SparkRowIterator and verify it skips the required partitions
        // and all the partitions returned are within the Spark token range.
        final SparkRowIterator it = new SparkRowIterator(dataLayer);
        int count = 0;
        while (it.next()) {
            final InternalRow row = it.get();
            assertEquals(row.getInt(2), row.getInt(0) + row.getInt(1));
            final DecoratedKey key = FourZero.getPartitioner(partitioner).decorateKey((ByteBuffer) ByteBuffer.allocate(4).putInt(row.getInt(0)).flip());
            final BigInteger token = FourZeroUtils.tokenToBigInteger(key.getToken());
            assertTrue(sparkTokenRange.contains(token));
            count++;
        }
        assertTrue(skipCount.get() > 0);
        // should skip out of range partitions here
        assertEquals((NUM_ROWS - skipCount.get()) * NUM_COLS, count);
        assertTrue(pass.get());
    });
}
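The end-to-end assertion counts only rows whose partition token lands inside the Spark token range and expects (NUM_ROWS - skipCount.get()) * NUM_COLS of them. Below is a sketch of that counting loop, assuming only Guava; hashFn is a hypothetical stand-in for the bridge/partitioner hash used in the test, not a project API.

import com.google.common.collect.Range;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.function.Function;

public class RangeCountSketch {
    // Encode each int partition key the way the test does, hash it to a token,
    // and count how many partitions fall inside the Spark token range.
    static long countPartitionsInRange(final int numRows,
                                       final Function<ByteBuffer, BigInteger> hashFn,
                                       final Range<BigInteger> sparkTokenRange) {
        long count = 0;
        for (int i = 0; i < numRows; i++) {
            // the cast keeps the code Java 8 compatible, matching the test above
            final ByteBuffer key = (ByteBuffer) ByteBuffer.allocate(4).putInt(i).flip();
            if (sparkTokenRange.contains(hashFn.apply(key))) {
                count++;
            }
        }
        return count;
    }
}

Rows read end-to-end should then equal countPartitionsInRange(NUM_ROWS, hashFn, sparkTokenRange) * NUM_COLS, matching the final assertions in the test.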
Also used: SparkRowIterator (org.apache.cassandra.spark.sparksql.SparkRowIterator), Set (java.util.Set), HashSet (java.util.HashSet), SSTablesSupplier (org.apache.cassandra.spark.data.SSTablesSupplier), List (java.util.List), ArrayList (java.util.ArrayList), SparkRangeFilter (org.apache.cassandra.spark.sparksql.filters.SparkRangeFilter), InternalRow (org.apache.spark.sql.catalyst.InternalRow), Path (java.nio.file.Path), TableMetadata (org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata), DecoratedKey (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.DecoratedKey), BufferDecoratedKey (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.BufferDecoratedKey), TestSchema (org.apache.cassandra.spark.TestSchema), ByteBuffer (java.nio.ByteBuffer), AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean), AtomicInteger (java.util.concurrent.atomic.AtomicInteger), TestDataLayer (org.apache.cassandra.spark.TestDataLayer), Stats (org.apache.cassandra.spark.stats.Stats), BigInteger (java.math.BigInteger), SparkSSTableReader (org.apache.cassandra.spark.reader.SparkSSTableReader), TestUtils.runTest (org.apache.cassandra.spark.TestUtils.runTest), Test (org.junit.Test)

Aggregations

Path (java.nio.file.Path): 21
TestDataLayer (org.apache.cassandra.spark.TestDataLayer): 21
TestSchema (org.apache.cassandra.spark.TestSchema): 21
TestUtils.runTest (org.apache.cassandra.spark.TestUtils.runTest): 21
Test (org.junit.Test): 21
TableMetadata (org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata): 12
BigInteger (java.math.BigInteger): 9
ByteBuffer (java.nio.ByteBuffer): 8
AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean): 8
Stats (org.apache.cassandra.spark.stats.Stats): 8
Filter (org.apache.spark.sql.sources.Filter): 8
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 7
SparkRangeFilter (org.apache.cassandra.spark.sparksql.filters.SparkRangeFilter): 7
List (java.util.List): 6
ReplicationFactor (org.apache.cassandra.spark.data.ReplicationFactor): 6
EqualTo (org.apache.spark.sql.sources.EqualTo): 6
ArrayList (java.util.ArrayList): 5
DecoratedKey (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.DecoratedKey): 5
CustomFilter (org.apache.cassandra.spark.sparksql.filters.CustomFilter): 5
BufferedInputStream (java.io.BufferedInputStream): 4