Example 1 with LocalDataLayer

Use of org.apache.cassandra.spark.data.LocalDataLayer in the project spark-cassandra-bulkreader by jberragan.

From class IndexDbTests, method testSearchIndex.

@Test
public void testSearchIndex() {
    runTest((partitioner, dir, bridge) -> {
        final TestSchema schema = TestSchema.basicBuilder(bridge).withCompression(false).build();
        final IPartitioner iPartitioner = FourZero.getPartitioner(partitioner);
        final int numRows = 5000;
        // write an sstable and record the token for each partition key
        final List<BigInteger> tokens = new ArrayList<>(numRows);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < numRows; i++) {
                final ByteBuffer key = (ByteBuffer) ByteBuffer.allocate(4).putInt(i).flip();
                final BigInteger token = FourZeroUtils.tokenToBigInteger(iPartitioner.decorateKey(key).getToken());
                tokens.add(token);
                writer.write(i, 0, i);
            }
        });
        assertEquals(1, countSSTables(dir));
        Collections.sort(tokens);
        final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
        if (metadata == null) {
            throw new NullPointerException("Could not find table");
        }
        final Path summaryDb = TestUtils.getFirstFileType(dir, DataLayer.FileType.SUMMARY);
        assertNotNull(summaryDb);
        final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, schema.keyspace, schema.createStmt, false, Collections.emptySet(), true, null, dir.toString());
        final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));
        final int rowSize = 39;
        final int sample = 4;
        // sample the token list and, for each sampled token, read the offset in Index.db and verify it matches the expected offset.
        // we sample the list because IndexDbUtils.findStartOffset(...) returns the previous offset, so we want to test
        // that it correctly skips tokens less than the token we are looking for before returning.
        final List<BigInteger> sparseList = IntStream.range(0, tokens.size()).filter(i -> i > 0 && i % sample == 0).mapToObj(tokens::get).collect(Collectors.toList());
        assertEquals((numRows / sample) - 1, sparseList.size());
        try (final DataInputStream in = new DataInputStream(Objects.requireNonNull(ssTable.openPrimaryIndexStream()))) {
            try {
                for (int idx = 0; idx < sparseList.size(); idx++) {
                    final BigInteger token = sparseList.get(idx);
                    final long expectedOffset = (((idx + 1L) * sample) - 1) * rowSize;
                    final long offset = IndexDbUtils.findStartOffset(in, iPartitioner, Range.closed(token, token), Stats.DoNothingStats.INSTANCE);
                    assertEquals(expectedOffset, offset);
                    FourZeroUtils.skipRowIndexEntry(in);
                }
            } catch (final EOFException ignore) {
            }
        }
    });
}
Also used: TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata), Path(java.nio.file.Path), ArrayList(java.util.ArrayList), TestSchema(org.apache.cassandra.spark.TestSchema), DataInputStream(java.io.DataInputStream), ByteBuffer(java.nio.ByteBuffer), LocalDataLayer(org.apache.cassandra.spark.data.LocalDataLayer), DataLayer(org.apache.cassandra.spark.data.DataLayer), EOFException(java.io.EOFException), BigInteger(java.math.BigInteger), IPartitioner(org.apache.cassandra.spark.shaded.fourzero.cassandra.dht.IPartitioner), TestUtils.runTest(org.apache.cassandra.spark.TestUtils.runTest), Test(org.junit.Test)
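
The test above (and the others on this page) builds partition keys with the expression (ByteBuffer) ByteBuffer.allocate(4).putInt(i).flip(). A minimal, self-contained sketch of that pattern is below; the class and method names are hypothetical, added here only to explain why the cast is present.

import java.nio.ByteBuffer;

final class KeyEncoding {
    // Encode an int partition key exactly the way the tests above do:
    // allocate 4 bytes, write the int, then flip the buffer for reading.
    // On Java 8, Buffer.flip() is declared to return Buffer, so the explicit
    // cast is needed to get the ByteBuffer type back.
    static ByteBuffer intKey(final int i) {
        return (ByteBuffer) ByteBuffer.allocate(4).putInt(i).flip();
    }
}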

Example 2 with LocalDataLayer

Use of org.apache.cassandra.spark.data.LocalDataLayer in the project spark-cassandra-bulkreader by jberragan.

From class IndexOffsetTests, method test.

private static void test(final CassandraBridge bridge, final Path dir, final Partitioner partitioner, final boolean enableCompression) throws IOException {
    final int numKeys = 500000;
    final int sparkPartitions = 128;
    final TestSchema schema = TestSchema.basicBuilder(bridge).withCompression(enableCompression).build();
    TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
        for (int i = 0; i < numKeys; i++) {
            writer.write(i, 0, i);
        }
    });
    assertEquals(1, countSSTables(dir));
    final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
    if (metadata == null) {
        throw new NullPointerException("Could not find table");
    }
    final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, schema.keyspace, schema.createStmt, false, Collections.emptySet(), true, null, dir.toString());
    final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));
    final Integer[] counts = IntStream.range(0, numKeys).map(i -> 0).boxed().toArray(Integer[]::new);
    final CassandraRing ring = TestUtils.createRing(partitioner, 32);
    // use TokenPartitioner to simulate the Spark workers' token partitions
    final TokenPartitioner tokenPartitioner = new TokenPartitioner(ring, 1, sparkPartitions);
    final List<Range<BigInteger>> ranges = tokenPartitioner.subRanges();
    LOGGER.info("Testing index offsets numKeys={} sparkPartitions={} partitioner={} enableCompression={}", numKeys, ranges.size(), partitioner.name(), enableCompression);
    final MutableInt skipped = new MutableInt(0);
    for (Range<BigInteger> range : ranges) {
        final FourZeroSSTableReader reader = FourZeroSSTableReader.builder(metadata, ssTable).withFilters(Collections.singletonList(SparkRangeFilter.create(range))).withStats(new Stats() {

            public void skippedPartition(ByteBuffer key, BigInteger token) {
                skipped.add(1);
            }
        }).build();
        if (reader.ignore()) {
            // we can skip this range entirely; it doesn't overlap with the sstable
            continue;
        }
        // each scanner should only read tokens within its own token range
        try (final ISSTableScanner scanner = reader.getScanner()) {
            while (scanner.hasNext()) {
                final UnfilteredRowIterator rowIterator = scanner.next();
                final int key = rowIterator.partitionKey().getKey().getInt();
                // count how many times we read a key across all 'spark' token partitions
                counts[key] = counts[key] + 1;
                while (rowIterator.hasNext()) {
                    rowIterator.next();
                }
            }
        }
    }
    // verify we read each key exactly once across all Spark partitions
    assertEquals(numKeys, counts.length);
    int idx = 0;
    for (final Integer count : counts) {
        if (count != 1) {
            final BigInteger token = FourZeroUtils.tokenToBigInteger(FourZero.getPartitioner(partitioner).decorateKey((ByteBuffer) ByteBuffer.allocate(4).putInt(idx).flip()).getToken());
            if (count == 0) {
                LOGGER.error("Missing key key={} token={} partitioner={}", idx, token, partitioner.name());
            } else {
                LOGGER.error("Key read by more than 1 Spark partition key={} token={} partitioner={}", idx, token, partitioner.name());
            }
        }
        assertEquals(count == 0 ? "Key not found: " + idx : "Key " + idx + " read " + count + " times", 1, count.intValue());
        idx++;
    }
    LOGGER.info("Success skippedKeys={} partitioner={}", skipped.intValue(), partitioner.name());
}
Also used: TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata), ISSTableScanner(org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner), UnfilteredRowIterator(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.UnfilteredRowIterator), CassandraRing(org.apache.cassandra.spark.data.partitioner.CassandraRing), TestSchema(org.apache.cassandra.spark.TestSchema), Range(com.google.common.collect.Range), ByteBuffer(java.nio.ByteBuffer), LocalDataLayer(org.apache.cassandra.spark.data.LocalDataLayer), BigInteger(java.math.BigInteger), DataLayer(org.apache.cassandra.spark.data.DataLayer), MutableInt(org.apache.commons.lang.mutable.MutableInt), Stats(org.apache.cassandra.spark.stats.Stats), TokenPartitioner(org.apache.cassandra.spark.data.partitioner.TokenPartitioner)
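
The exactly-once check above works because the sub-ranges produced by TokenPartitioner should be pairwise disjoint, so each token (and hence each key) falls into at most one Spark partition. Below is a small, self-contained sketch of that invariant using only Guava; the class and the helper name assertDisjoint are hypothetical and not part of the project.

import java.math.BigInteger;
import java.util.List;

import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeSet;

final class RangeChecks {
    // Fails if any two token ranges overlap, i.e. if some token could be
    // assigned to more than one Spark partition.
    static void assertDisjoint(final List<Range<BigInteger>> ranges) {
        final RangeSet<BigInteger> seen = TreeRangeSet.create();
        for (final Range<BigInteger> range : ranges) {
            if (!seen.subRangeSet(range).isEmpty()) {
                throw new AssertionError("Overlapping token range: " + range);
            }
            seen.add(range);
        }
    }
}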

Example 3 with LocalDataLayer

Use of org.apache.cassandra.spark.data.LocalDataLayer in the project spark-cassandra-bulkreader by jberragan.

From class SummaryDbTests, method testSearchSummary.

@Test
public void testSearchSummary() {
    runTest((partitioner, dir, bridge) -> {
        final TestSchema schema = TestSchema.basicBuilder(bridge).withCompression(false).build();
        final IPartitioner iPartitioner = FourZero.getPartitioner(partitioner);
        final int numRows = 1000;
        // write an sstable and record the token for each partition key
        final List<BigInteger> tokens = new ArrayList<>(numRows);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < numRows; i++) {
                final ByteBuffer key = (ByteBuffer) ByteBuffer.allocate(4).putInt(i).flip();
                final BigInteger token = FourZeroUtils.tokenToBigInteger(iPartitioner.decorateKey(key).getToken());
                tokens.add(token);
                writer.write(i, 0, i);
            }
        });
        assertEquals(1, countSSTables(dir));
        Collections.sort(tokens);
        final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
        if (metadata == null) {
            throw new NullPointerException("Could not find table");
        }
        final Path summaryDb = TestUtils.getFirstFileType(dir, DataLayer.FileType.SUMMARY);
        assertNotNull(summaryDb);
        final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, schema.keyspace, schema.createStmt, false, Collections.emptySet(), true, null, dir.toString());
        final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));
        // binary search the Summary.db summary in token order and verify the returned index offsets never decrease
        final SummaryDbUtils.Summary summary = SummaryDbUtils.readSummary(metadata, ssTable);
        long prev = -1;
        for (final BigInteger token : tokens) {
            final long offset = SummaryDbUtils.findIndexOffsetInSummary(summary.summary(), iPartitioner, token);
            if (prev == -1) {
                assertEquals(0, offset);
            } else {
                assertTrue(prev <= offset);
            }
            prev = offset;
        }
    });
}
Also used: TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata), Path(java.nio.file.Path), ArrayList(java.util.ArrayList), TestSchema(org.apache.cassandra.spark.TestSchema), ByteBuffer(java.nio.ByteBuffer), LocalDataLayer(org.apache.cassandra.spark.data.LocalDataLayer), DataLayer(org.apache.cassandra.spark.data.DataLayer), BigInteger(java.math.BigInteger), IPartitioner(org.apache.cassandra.spark.shaded.fourzero.cassandra.dht.IPartitioner), Test(org.junit.Test), TestUtils.runTest(org.apache.cassandra.spark.TestUtils.runTest)
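
Conceptually, a summary lookup is a floor search: find the last summary entry whose first token is less than or equal to the search token, which is why the offsets above never decrease as the test walks the sorted token list. A self-contained sketch of that floor search over a sorted array is below; it illustrates the idea and is not the project's SummaryDbUtils implementation.

import java.math.BigInteger;

final class SummarySearch {
    // Returns the index of the greatest element of 'sortedTokens' that is
    // <= 'target', or -1 if all elements are greater than 'target'.
    static int floorIndex(final BigInteger[] sortedTokens, final BigInteger target) {
        int low = 0;
        int high = sortedTokens.length - 1;
        int result = -1;
        while (low <= high) {
            final int mid = (low + high) >>> 1;
            if (sortedTokens[mid].compareTo(target) <= 0) {
                // candidate found; keep searching to the right for a later one
                result = mid;
                low = mid + 1;
            } else {
                high = mid - 1;
            }
        }
        return result;
    }
}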

Example 4 with LocalDataLayer

Use of org.apache.cassandra.spark.data.LocalDataLayer in the project spark-cassandra-bulkreader by jberragan.

From class KryoSerializationTests, method testLocalDataLayerThreeZero.

@Test
public void testLocalDataLayerThreeZero() {
    final String path1 = UUID.randomUUID().toString(), path2 = UUID.randomUUID().toString(), path3 = UUID.randomUUID().toString();
    final LocalDataLayer localDataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.THREEZERO, "test_keyspace", "create table test_keyspace.test_table (a int, b int, c int, primary key(a, b));", path1, path2, path3);
    final Output out = KryoSerializationTests.serialize(localDataLayer);
    final LocalDataLayer deserialized = KryoSerializationTests.deserialize(out, LocalDataLayer.class);
    assertNotNull(deserialized);
    assertEquals(localDataLayer.version(), deserialized.version());
    assertEquals(localDataLayer, deserialized);
}
Also used: Output(com.esotericsoftware.kryo.io.Output), LocalDataLayer(org.apache.cassandra.spark.data.LocalDataLayer), Test(org.junit.Test)

Example 5 with LocalDataLayer

Use of org.apache.cassandra.spark.data.LocalDataLayer in the project spark-cassandra-bulkreader by jberragan.

From class KryoSerializationTests, method testLocalDataLayerFourZero.

@Test
public void testLocalDataLayerFourZero() {
    final String path1 = UUID.randomUUID().toString(), path2 = UUID.randomUUID().toString(), path3 = UUID.randomUUID().toString();
    final LocalDataLayer localDataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, "test_keyspace", "create table test_keyspace.test_table (a int, b int, c int, primary key(a, b));", path1, path2, path3);
    final Output out = KryoSerializationTests.serialize(localDataLayer);
    final LocalDataLayer deserialized = KryoSerializationTests.deserialize(out, LocalDataLayer.class);
    assertNotNull(deserialized);
    assertEquals(localDataLayer.version(), deserialized.version());
    assertEquals(localDataLayer, deserialized);
}
Also used: Output(com.esotericsoftware.kryo.io.Output), LocalDataLayer(org.apache.cassandra.spark.data.LocalDataLayer), Test(org.junit.Test)
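
Both Kryo tests call serialize and deserialize helpers on KryoSerializationTests that are not shown here. A minimal sketch of what such round-trip helpers can look like with the Kryo API is below; the buffer sizes and the disabled registration are assumptions made for illustration (the real project registers LocalDataLayer with a custom serializer), not the project's actual test utilities.

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;

final class KryoRoundTrip {
    private static final Kryo KRYO = new Kryo();

    static {
        // assumption: allow unregistered classes to keep the sketch minimal
        KRYO.setRegistrationRequired(false);
    }

    static Output serialize(final Object object) {
        // 1 KiB initial buffer, unbounded growth
        final Output out = new Output(1024, -1);
        KRYO.writeObject(out, object);
        out.close();
        return out;
    }

    static <T> T deserialize(final Output out, final Class<T> type) {
        try (Input in = new Input(out.toBytes())) {
            return KRYO.readObject(in, type);
        }
    }
}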

Aggregations

LocalDataLayer (org.apache.cassandra.spark.data.LocalDataLayer): 5 usages
Test (org.junit.Test): 4 usages
BigInteger (java.math.BigInteger): 3 usages
ByteBuffer (java.nio.ByteBuffer): 3 usages
TestSchema (org.apache.cassandra.spark.TestSchema): 3 usages
DataLayer (org.apache.cassandra.spark.data.DataLayer): 3 usages
TableMetadata (org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata): 3 usages
Output (com.esotericsoftware.kryo.io.Output): 2 usages
Path (java.nio.file.Path): 2 usages
ArrayList (java.util.ArrayList): 2 usages
TestUtils.runTest (org.apache.cassandra.spark.TestUtils.runTest): 2 usages
IPartitioner (org.apache.cassandra.spark.shaded.fourzero.cassandra.dht.IPartitioner): 2 usages
Range (com.google.common.collect.Range): 1 usage
DataInputStream (java.io.DataInputStream): 1 usage
EOFException (java.io.EOFException): 1 usage
CassandraRing (org.apache.cassandra.spark.data.partitioner.CassandraRing): 1 usage
TokenPartitioner (org.apache.cassandra.spark.data.partitioner.TokenPartitioner): 1 usage
UnfilteredRowIterator (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.UnfilteredRowIterator): 1 usage
ISSTableScanner (org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner): 1 usage
Stats (org.apache.cassandra.spark.stats.Stats): 1 usage
MutableInt (org.apache.commons.lang.mutable.MutableInt): 1 usage