Example 1 with TestSchema

Use of org.apache.cassandra.spark.TestSchema in project spark-cassandra-bulkreader by jberragan.

From class IndexDbTests, method testSearchIndex.

@Test
public void testSearchIndex() {
    runTest((partitioner, dir, bridge) -> {
        final TestSchema schema = TestSchema.basicBuilder(bridge).withCompression(false).build();
        final IPartitioner iPartitioner = FourZero.getPartitioner(partitioner);
        final int numRows = 5000;
        // write an SSTable and record each row's token
        final List<BigInteger> tokens = new ArrayList<>(numRows);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < numRows; i++) {
                final ByteBuffer key = (ByteBuffer) ByteBuffer.allocate(4).putInt(i).flip();
                final BigInteger token = FourZeroUtils.tokenToBigInteger(iPartitioner.decorateKey(key).getToken());
                tokens.add(token);
                writer.write(i, 0, i);
            }
        });
        assertEquals(1, countSSTables(dir));
        Collections.sort(tokens);
        final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
        if (metadata == null) {
            throw new NullPointerException("Could not find table");
        }
        final Path summaryDb = TestUtils.getFirstFileType(dir, DataLayer.FileType.SUMMARY);
        assertNotNull(summaryDb);
        final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, schema.keyspace, schema.createStmt, false, Collections.emptySet(), true, null, dir.toString());
        final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));
        final int rowSize = 39;
        final int sample = 4;
        // sample the token list and, for each sampled token, read the offset in Index.db and verify it matches the expected offset.
        // We sample the list because IndexDbUtils.findStartOffset(...) returns the previous offset, so we want to verify
        // that it correctly skips tokens less than the token we are looking for before returning.
        final List<BigInteger> sparseList = IntStream.range(0, tokens.size()).filter(i -> i > 0 && i % sample == 0).mapToObj(tokens::get).collect(Collectors.toList());
        assertEquals((numRows / sample) - 1, sparseList.size());
        try (final DataInputStream in = new DataInputStream(Objects.requireNonNull(ssTable.openPrimaryIndexStream()))) {
            try {
                for (int idx = 0; idx < sparseList.size(); idx++) {
                    final BigInteger token = sparseList.get(idx);
                    final long expectedOffset = (((idx + 1L) * sample) - 1) * rowSize;
                    final long offset = IndexDbUtils.findStartOffset(in, iPartitioner, Range.closed(token, token), Stats.DoNothingStats.INSTANCE);
                    assertEquals(expectedOffset, offset);
                    FourZeroUtils.skipRowIndexEntry(in);
                }
            } catch (final EOFException ignore) {
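                // end of Index.db reached; expected once the final sampled entry has been read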
            }
        }
    });
}
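The expected-offset arithmetic in this test is easy to check by hand. Below is a minimal, self-contained sketch of that arithmetic; the rowSize of 39 bytes and the sampling stride of 4 come from the test above, while the class name IndexOffsetCheck is purely illustrative:

public final class IndexOffsetCheck {
    public static void main(String[] args) {
        final int rowSize = 39; // bytes per Index.db entry, taken from the test fixture above
        final int sample = 4;   // stride used to build the sparse token list
        // sparseList.get(idx) is tokens.get((idx + 1) * sample), and findStartOffset(...)
        // returns the offset of the preceding entry, i.e. entry ((idx + 1) * sample) - 1
        for (int idx = 0; idx < 3; idx++) {
            final long expectedOffset = (((idx + 1L) * sample) - 1) * rowSize;
            System.out.printf("idx=%d -> index entry %d -> offset %d%n",
                              idx, ((idx + 1) * sample) - 1, expectedOffset);
        }
        // prints: idx=0 -> index entry 3 -> offset 117
        //         idx=1 -> index entry 7 -> offset 273
        //         idx=2 -> index entry 11 -> offset 429
    }
}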
Also used : TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) Path(java.nio.file.Path) ArrayList(java.util.ArrayList) TestSchema(org.apache.cassandra.spark.TestSchema) DataInputStream(java.io.DataInputStream) ByteBuffer(java.nio.ByteBuffer) LocalDataLayer(org.apache.cassandra.spark.data.LocalDataLayer) DataLayer(org.apache.cassandra.spark.data.DataLayer) EOFException(java.io.EOFException) BigInteger(java.math.BigInteger) IPartitioner(org.apache.cassandra.spark.shaded.fourzero.cassandra.dht.IPartitioner) TestUtils.runTest(org.apache.cassandra.spark.TestUtils.runTest) Test(org.junit.Test)

Example 2 with TestSchema

Use of org.apache.cassandra.spark.TestSchema in project spark-cassandra-bulkreader by jberragan.

From class SSTableReaderTests, method testSkipNoPartitions.

@Test
public void testSkipNoPartitions() {
    runTest((partitioner, dir, bridge) -> {
        // write an SSTable
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < NUM_ROWS; i++) {
                for (int j = 0; j < NUM_COLS; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));
        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final Path summaryFile = getFirstFileType(dir, DataLayer.FileType.SUMMARY);
        final TableMetadata metaData = schema.schemaBuilder(partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));
        SummaryDbUtils.Summary summary;
        try (final InputStream in = new BufferedInputStream(Files.newInputStream(summaryFile))) {
            summary = SummaryDbUtils.readSummary(in, metaData.partitioner, metaData.params.minIndexInterval, metaData.params.maxIndexInterval);
        }
        // set Spark token range equal to SSTable token range
        final Range<BigInteger> sparkTokenRange = Range.closed(FourZeroUtils.tokenToBigInteger(summary.first().getToken()), FourZeroUtils.tokenToBigInteger(summary.last().getToken()));
        final SparkRangeFilter rangeFilter = SparkRangeFilter.create(sparkTokenRange);
        final AtomicBoolean skipped = new AtomicBoolean(false);
        final Stats stats = new Stats() {

            @Override
            public void skippedPartition(ByteBuffer key, BigInteger token) {
                LOGGER.error("Skipped partition when should not: " + token);
                skipped.set(true);
            }
        };
        final FourZeroSSTableReader reader = openReader(metaData, dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")), Collections.singletonList(rangeFilter), true, stats);
        // shouldn't skip any partitions here
        assertEquals(NUM_ROWS * NUM_COLS, countAndValidateRows(reader));
        assertFalse(skipped.get());
    });
}
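Nothing is skipped here because the Spark token range is the closed interval from the SSTable's first token to its last, so every partition token is contained in it. A minimal sketch of that containment property using Guava's Range (the token values below are made up for illustration):

import com.google.common.collect.Range;
import java.math.BigInteger;

public final class RangeContainmentCheck {
    public static void main(String[] args) {
        // stand-ins for the SSTable's first and last tokens; values are illustrative only
        final BigInteger first = BigInteger.valueOf(-42L);
        final BigInteger last = BigInteger.valueOf(1337L);
        final Range<BigInteger> sparkTokenRange = Range.closed(first, last);
        // any token t with first <= t <= last lies inside the closed range,
        // so a filter built from it can never skip a partition in the SSTable
        for (final BigInteger token : new BigInteger[] { first, BigInteger.ZERO, last }) {
            System.out.println(token + " contained: " + sparkTokenRange.contains(token)); // always true
        }
    }
}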
Also used : Path(java.nio.file.Path) TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) BufferedInputStream(java.io.BufferedInputStream) DataInputStream(java.io.DataInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) TestSchema(org.apache.cassandra.spark.TestSchema) ByteBuffer(java.nio.ByteBuffer) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) TestDataLayer(org.apache.cassandra.spark.TestDataLayer) Stats(org.apache.cassandra.spark.stats.Stats) BigInteger(java.math.BigInteger) SparkRangeFilter(org.apache.cassandra.spark.sparksql.filters.SparkRangeFilter) TestUtils.runTest(org.apache.cassandra.spark.TestUtils.runTest) Test(org.junit.Test)

Example 3 with TestSchema

Use of org.apache.cassandra.spark.TestSchema in project spark-cassandra-bulkreader by jberragan.

From class SSTableReaderTests, method testPartialFilterMatch.

@Test
public void testPartialFilterMatch() {
    runTest((partitioner, dir, bridge) -> {
        // write an SSTable
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < NUM_ROWS; i++) {
                for (int j = 0; j < NUM_COLS; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));
        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final TableMetadata metaData = new FourZeroSchemaBuilder(schema.createStmt, schema.keyspace, new ReplicationFactor(ReplicationFactor.ReplicationStrategy.SimpleStrategy, ImmutableMap.of("replication_factor", 1)), partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));
        final ByteBuffer key1 = Int32Type.instance.fromString("0");
        final BigInteger token1 = bridge.hash(partitioner, key1);
        final PartitionKeyFilter keyInSSTable = PartitionKeyFilter.create(key1, token1);
        final SparkRangeFilter rangeFilter = SparkRangeFilter.create(Range.closed(token1, token1));
        final ByteBuffer key2 = Int32Type.instance.fromString("55");
        final BigInteger token2 = bridge.hash(partitioner, key2);
        final PartitionKeyFilter keyNotInSSTable = PartitionKeyFilter.create(key2, token2);
        final List<CustomFilter> filters = Arrays.asList(rangeFilter, keyInSSTable, keyNotInSSTable);
        final AtomicBoolean pass = new AtomicBoolean(true);
        final AtomicInteger skipCount = new AtomicInteger(0);
        final Stats stats = new Stats() {

            @Override
            public void skippedPartition(ByteBuffer key, BigInteger token) {
                LOGGER.info("Skipping partition: " + token);
                skipCount.incrementAndGet();
                if (filters.stream().anyMatch(filter -> !filter.skipPartition(key, token))) {
                    LOGGER.info("Should not skip partition: " + token);
                    pass.set(false);
                }
            }
        };
        final FourZeroSSTableReader reader = openReader(metaData, dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")), filters, false, stats);
        final int rows = countAndValidateRows(reader);
        assertTrue(skipCount.get() > 0);
        assertEquals(NUM_COLS, rows);
        // should skip partitions not matching filters
        assertEquals((NUM_ROWS - skipCount.get()) * NUM_COLS, rows);
        assertTrue(pass.get());
    });
}
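The Stats override above encodes the skip contract this test exercises: a partition may be skipped only if every filter agrees to skip it; if any filter wants the partition, skipping it is an error. A minimal sketch of that all-must-skip predicate, where SkipFilter is a hypothetical stand-in for CustomFilter:

import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.List;

public final class SkipContract {
    // hypothetical stand-in for org.apache.cassandra.spark.sparksql.filters.CustomFilter
    interface SkipFilter {
        boolean skipPartition(ByteBuffer key, BigInteger token);
    }

    static boolean shouldSkip(final List<SkipFilter> filters, final ByteBuffer key, final BigInteger token) {
        // skip only when every filter skips; equivalently, do not skip if any filter keeps
        // the partition, which mirrors anyMatch(filter -> !filter.skipPartition(key, token))
        return filters.stream().allMatch(filter -> filter.skipPartition(key, token));
    }
}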
Also used : Path(java.nio.file.Path) TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) ReplicationFactor(org.apache.cassandra.spark.data.ReplicationFactor) TestSchema(org.apache.cassandra.spark.TestSchema) ByteBuffer(java.nio.ByteBuffer) PartitionKeyFilter(org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) CustomFilter(org.apache.cassandra.spark.sparksql.filters.CustomFilter) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) TestDataLayer(org.apache.cassandra.spark.TestDataLayer) Stats(org.apache.cassandra.spark.stats.Stats) BigInteger(java.math.BigInteger) SparkRangeFilter(org.apache.cassandra.spark.sparksql.filters.SparkRangeFilter) TestUtils.runTest(org.apache.cassandra.spark.TestUtils.runTest) Test(org.junit.Test)

Example 4 with TestSchema

Use of org.apache.cassandra.spark.TestSchema in project spark-cassandra-bulkreader by jberragan.

From class SSTableReaderTests, method testOpenSSTableReader.

@Test
public void testOpenSSTableReader() {
    runTest((partitioner, dir, bridge) -> {
        // write an SSTable
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < NUM_ROWS; i++) {
                for (int j = 0; j < NUM_COLS; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));
        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final TableMetadata metaData = schema.schemaBuilder(partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));
        final FourZeroSSTableReader reader = openReader(metaData, dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")));
        assertNotNull(reader.firstToken());
        assertNotNull(reader.lastToken());
        assertNotNull(reader.getSSTableMetadata());
        assertFalse(reader.isRepaired());
        assertEquals(NUM_ROWS * NUM_COLS, countAndValidateRows(reader));
    });
}
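The final assertion follows directly from the writer loop: one CQL row is written per (i, j) pair, so the reader must return NUM_ROWS * NUM_COLS rows. A trivial sketch of that count (the constant values here are made up, since the test does not show them):

public final class RowCountCheck {
    public static void main(String[] args) {
        final int numRows = 50; // stand-in for NUM_ROWS, which is defined elsewhere in the test class
        final int numCols = 10; // stand-in for NUM_COLS
        int written = 0;
        for (int i = 0; i < numRows; i++) {
            for (int j = 0; j < numCols; j++) {
                written++; // one row per (partition key i, clustering key j) pair
            }
        }
        System.out.println(written == numRows * numCols); // true
    }
}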
Also used : Path(java.nio.file.Path) TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) TestDataLayer(org.apache.cassandra.spark.TestDataLayer) TestSchema(org.apache.cassandra.spark.TestSchema) TestUtils.runTest(org.apache.cassandra.spark.TestUtils.runTest) Test(org.junit.Test)

Example 5 with TestSchema

Use of org.apache.cassandra.spark.TestSchema in project spark-cassandra-bulkreader by jberragan.

From class SSTableReaderTests, method testSSTableRange.

@Test
public void testSSTableRange() {
    runTest((partitioner, dir, bridge) -> {
        // write an SSTable
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < 10; i++) {
                for (int j = 0; j < 1; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));
        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final TableMetadata metaData = schema.schemaBuilder(partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));
        final SparkSSTableReader reader = openReader(metaData, dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")));
        assertNotNull(reader.firstToken());
        assertNotNull(reader.lastToken());
        // verify the primary Index.db file matches the reader's first and last tokens
        final Path indexFile = getFirstFileType(dir, DataLayer.FileType.INDEX);
        final Pair<DecoratedKey, DecoratedKey> firstAndLast;
        try (final InputStream is = new BufferedInputStream(new FileInputStream(indexFile.toFile()))) {
            final Pair<ByteBuffer, ByteBuffer> keys = FourZeroUtils.readPrimaryIndex(is, true, Collections.emptyList());
            firstAndLast = Pair.of(FourZero.getPartitioner(partitioner).decorateKey(keys.getLeft()), FourZero.getPartitioner(partitioner).decorateKey(keys.getRight()));
        }
        final BigInteger first = FourZeroUtils.tokenToBigInteger(firstAndLast.getLeft().getToken());
        final BigInteger last = FourZeroUtils.tokenToBigInteger(firstAndLast.getRight().getToken());
        assertEquals(first, reader.firstToken());
        assertEquals(last, reader.lastToken());
        switch(partitioner) {
            case Murmur3Partitioner:
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(Partitioner.Murmur3Partitioner.minToken(), Partitioner.Murmur3Partitioner.minToken())));
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(-8710962479251732708L), BigInteger.valueOf(-7686143364045646507L))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(-7509452495886106294L), BigInteger.valueOf(-7509452495886106293L))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(-7509452495886106293L), BigInteger.valueOf(-7509452495886106293L))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(-7509452495886106293L), BigInteger.valueOf(2562047788015215502L))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(-7509452495886106293L), BigInteger.valueOf(9010454139840013625L))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(9010454139840013625L), BigInteger.valueOf(9010454139840013625L))));
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(Partitioner.Murmur3Partitioner.maxToken(), Partitioner.Murmur3Partitioner.maxToken())));
                return;
            case RandomPartitioner:
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(Partitioner.RandomPartitioner.minToken(), Partitioner.RandomPartitioner.minToken())));
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(BigInteger.valueOf(0L), BigInteger.valueOf(500L))));
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(new BigInteger("18837662806270881894834867523173387677"), new BigInteger("18837662806270881894834867523173387677"))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(new BigInteger("18837662806270881894834867523173387678"), new BigInteger("18837662806270881894834867523173387678"))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(new BigInteger("18837662806270881894834867523173387679"), new BigInteger("18837662806270881894834867523173387679"))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(new BigInteger("18837662806270881894834867523173387679"), new BigInteger("137731376325982006772573399291321493164"))));
                assertTrue(SparkSSTableReader.overlaps(reader, Range.closed(new BigInteger("137731376325982006772573399291321493164"), new BigInteger("137731376325982006772573399291321493164"))));
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(new BigInteger("137731376325982006772573399291321493165"), new BigInteger("137731376325982006772573399291321493165"))));
                assertFalse(SparkSSTableReader.overlaps(reader, Range.closed(Partitioner.RandomPartitioner.maxToken(), Partitioner.RandomPartitioner.maxToken())));
                return;
            default:
                throw new RuntimeException("Unexpected partitioner: " + partitioner);
        }
    });
}
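The overlap assertions above treat both endpoints as inclusive, so a Spark range that merely touches the SSTable's first or last token still counts as overlapping. A plausible sketch of such an overlap check with Guava's Range (this is an assumption for illustration, not the project's actual SparkSSTableReader.overlaps implementation):

import com.google.common.collect.Range;
import java.math.BigInteger;

public final class OverlapCheck {
    static boolean overlaps(final Range<BigInteger> sstableRange, final Range<BigInteger> sparkRange) {
        // two closed ranges overlap iff they are connected; for closed bounds the
        // intersection of connected ranges always contains at least one token
        return sstableRange.isConnected(sparkRange) && !sstableRange.intersection(sparkRange).isEmpty();
    }

    public static void main(String[] args) {
        // the SSTable's Murmur3 first/last tokens asserted in the test above
        final Range<BigInteger> sstable = Range.closed(
                BigInteger.valueOf(-7509452495886106293L),
                BigInteger.valueOf(9010454139840013625L));
        // a Spark range that merely touches the first token still overlaps (closed bounds)
        System.out.println(overlaps(sstable, Range.closed(
                BigInteger.valueOf(-7509452495886106294L),
                BigInteger.valueOf(-7509452495886106293L)))); // true
        // a range strictly below the first token does not overlap
        System.out.println(overlaps(sstable, Range.closed(
                BigInteger.valueOf(-8710962479251732708L),
                BigInteger.valueOf(-7686143364045646507L)))); // false
    }
}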
Also used : Path(java.nio.file.Path) TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) BufferedInputStream(java.io.BufferedInputStream) DataInputStream(java.io.DataInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) DecoratedKey(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.DecoratedKey) BufferDecoratedKey(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.BufferDecoratedKey) TestSchema(org.apache.cassandra.spark.TestSchema) ByteBuffer(java.nio.ByteBuffer) TestDataLayer(org.apache.cassandra.spark.TestDataLayer) BigInteger(java.math.BigInteger) SparkSSTableReader(org.apache.cassandra.spark.reader.SparkSSTableReader) TestUtils.runTest(org.apache.cassandra.spark.TestUtils.runTest) Test(org.junit.Test)

Aggregations

TestSchema (org.apache.cassandra.spark.TestSchema): 36
Test (org.junit.Test): 35
Path (java.nio.file.Path): 33
TestUtils.runTest (org.apache.cassandra.spark.TestUtils.runTest): 30
TestDataLayer (org.apache.cassandra.spark.TestDataLayer): 21
BigInteger (java.math.BigInteger): 16
ByteBuffer (java.nio.ByteBuffer): 16
DataLayer (org.apache.cassandra.spark.data.DataLayer): 15
TableMetadata (org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata): 15
BufferedInputStream (java.io.BufferedInputStream): 11
FileInputStream (java.io.FileInputStream): 11
InputStream (java.io.InputStream): 11
SparkRangeFilter (org.apache.cassandra.spark.sparksql.filters.SparkRangeFilter): 10
Stats (org.apache.cassandra.spark.stats.Stats): 10
ArrayList (java.util.ArrayList): 8
AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean): 8
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 8
Filter (org.apache.spark.sql.sources.Filter): 8
List (java.util.List): 7
TestUtils (org.apache.cassandra.spark.TestUtils): 7