
Example 1 with DataLayer

Use of org.apache.cassandra.spark.data.DataLayer in project spark-cassandra-bulkreader by jberragan.

From the class IndexDbTests, the method testSearchIndex. The test writes an SSTable of 5000 rows, then verifies that IndexDbUtils.findStartOffset(...) returns the expected Index.db offset for a sample of the sorted tokens.

@Test
public void testSearchIndex() {
    runTest((partitioner, dir, bridge) -> {
        final TestSchema schema = TestSchema.basicBuilder(bridge).withCompression(false).build();
        final IPartitioner iPartitioner = FourZero.getPartitioner(partitioner);
        final int numRows = 5000;
        // write an sstable and record token
        final List<BigInteger> tokens = new ArrayList<>(numRows);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < numRows; i++) {
                final ByteBuffer key = (ByteBuffer) ByteBuffer.allocate(4).putInt(i).flip();
                final BigInteger token = FourZeroUtils.tokenToBigInteger(iPartitioner.decorateKey(key).getToken());
                tokens.add(token);
                writer.write(i, 0, i);
            }
        });
        assertEquals(1, countSSTables(dir));
        Collections.sort(tokens);
        final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
        if (metadata == null) {
            throw new NullPointerException("Could not find table");
        }
        final Path summaryDb = TestUtils.getFirstFileType(dir, DataLayer.FileType.SUMMARY);
        assertNotNull(summaryDb);
        final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, schema.keyspace, schema.createStmt, false, Collections.emptySet(), true, null, dir.toString());
        final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));
        final int rowSize = 39;
        final int sample = 4;
        // sample the token list and, for each sampled token, read the offset in Index.db and verify it matches the expected offset;
        // we sample the list because IndexDbUtils.findStartOffset(...) returns the offset of the previous entry, so we want to
        // verify that it correctly skips tokens less than the one we are looking for before returning
        final List<BigInteger> sparseList = IntStream.range(0, tokens.size()).filter(i -> i > 0 && i % sample == 0).mapToObj(tokens::get).collect(Collectors.toList());
        assertEquals((numRows / sample) - 1, sparseList.size());
        try (final DataInputStream in = new DataInputStream(Objects.requireNonNull(ssTable.openPrimaryIndexStream()))) {
            try {
                for (int idx = 0; idx < sparseList.size(); idx++) {
                    final BigInteger token = sparseList.get(idx);
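                    // sparseList[idx] is tokens[(idx + 1) * sample]; findStartOffset returns the offset
                    // of the entry before that token, i.e. the start of row ((idx + 1) * sample) - 1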
                    final long expectedOffset = (((idx + 1L) * sample) - 1) * rowSize;
                    final long offset = IndexDbUtils.findStartOffset(in, iPartitioner, Range.closed(token, token), Stats.DoNothingStats.INSTANCE);
                    assertEquals(expectedOffset, offset);
                    FourZeroUtils.skipRowIndexEntry(in);
                }
            } catch (final EOFException ignore) {
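                // reaching EOF here just means we read past the final Index.db entry; this is expected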
            }
        }
    });
}
Also used : TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) Path(java.nio.file.Path) ArrayList(java.util.ArrayList) TestSchema(org.apache.cassandra.spark.TestSchema) DataInputStream(java.io.DataInputStream) ByteBuffer(java.nio.ByteBuffer) LocalDataLayer(org.apache.cassandra.spark.data.LocalDataLayer) DataLayer(org.apache.cassandra.spark.data.DataLayer) EOFException(java.io.EOFException) BigInteger(java.math.BigInteger) IPartitioner(org.apache.cassandra.spark.shaded.fourzero.cassandra.dht.IPartitioner) TestUtils.runTest(org.apache.cassandra.spark.TestUtils.runTest) Test(org.junit.Test)
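
Distilled from the example above, a minimal sketch of the DataLayer calls it exercises: build a LocalDataLayer over a directory of SSTable files, take the first SSTable it lists, and open that SSTable's primary index (Index.db) stream. The keyspace, createStmt, partitioner and dir names are placeholders for the test fixtures; the boolean flags and null argument are copied verbatim from the test without interpreting them, so treat this as an illustrative sketch rather than canonical API documentation.

// hedged sketch: the DataLayer-specific calls from testSearchIndex, out of context
final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, keyspace, createStmt, false, Collections.emptySet(), true, null, dir.toString());
final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));
try (final DataInputStream in = new DataInputStream(Objects.requireNonNull(ssTable.openPrimaryIndexStream()))) {
    // scan Index.db entries here, e.g. with IndexDbUtils.findStartOffset(...)
}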

Example 2 with DataLayer

Use of org.apache.cassandra.spark.data.DataLayer in project spark-cassandra-bulkreader by jberragan.

From the class SSTableReaderTests, the method testIncrementalRepair. The test marks some SSTables as repaired and verifies that a non-primary repair replica skips them, while the primary repair replica reads all of them.

// incremental repair
@Test
public void testIncrementalRepair() {
    runTest((partitioner, dir, bridge) -> {
        final TestSchema schema = TestSchema.basic(bridge);
        final int numSSTables = 4;
        final int numRepaired = 2;
        final int numUnRepaired = numSSTables - numRepaired;
        // write some SSTables
        for (int a = 0; a < numSSTables; a++) {
            final int pos = a * NUM_ROWS;
            TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
                for (int i = pos; i < pos + NUM_ROWS; i++) {
                    for (int j = 0; j < NUM_COLS; j++) {
                        writer.write(i, j, i + j);
                    }
                }
            });
        }
        assertEquals(numSSTables, countSSTables(dir));
        final TableMetadata metaData = new FourZeroSchemaBuilder(schema.createStmt, schema.keyspace, new ReplicationFactor(ReplicationFactor.ReplicationStrategy.SimpleStrategy, ImmutableMap.of("replication_factor", 1)), partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, getFileType(dir, DataLayer.FileType.DATA).collect(Collectors.toList()));
        final AtomicInteger skipCount = new AtomicInteger(0);
        final Stats stats = new Stats() {

            @Override
            public void skippedRepairedSSTable(DataLayer.SSTable ssTable, long repairedAt) {
                skipCount.incrementAndGet();
            }
        };
        // mark some SSTables as repaired
        final Map<DataLayer.SSTable, Boolean> isRepaired = dataLayer.listSSTables().collect(Collectors.toMap(Function.identity(), a -> false));
        int count = 0;
        for (final DataLayer.SSTable ssTable : isRepaired.keySet()) {
            if (count < numRepaired) {
                isRepaired.put(ssTable, true);
                count++;
            }
        }
        final List<FourZeroSSTableReader> primaryReaders = dataLayer.listSSTables().map(ssTable -> openIncrementalReader(metaData, ssTable, stats, true, isRepaired.get(ssTable))).filter(reader -> !reader.ignore()).collect(Collectors.toList());
        final List<FourZeroSSTableReader> nonPrimaryReaders = dataLayer.listSSTables().map(ssTable -> openIncrementalReader(metaData, ssTable, stats, false, isRepaired.get(ssTable))).filter(reader -> !reader.ignore()).collect(Collectors.toList());
        // primary repair replica should read all sstables
        assertEquals(numSSTables, primaryReaders.size());
        // non-primary repair replica should only read unrepaired sstables
        assertEquals(numUnRepaired, nonPrimaryReaders.size());
        for (final FourZeroSSTableReader reader : nonPrimaryReaders) {
            assertFalse(isRepaired.get(reader.sstable()));
        }
        assertEquals(numUnRepaired, skipCount.get());
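        // compact the repaired sstables (read only by the primary) together with the unrepaired
        // sstables (read by every replica) so each sstable is included exactly once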
        final Set<FourZeroSSTableReader> toCompact = Stream.concat(primaryReaders.stream().filter(r -> isRepaired.get(r.sstable())), nonPrimaryReaders.stream()).collect(Collectors.toSet());
        assertEquals(numSSTables, toCompact.size());
        int rowCount = 0;
        boolean[] found = new boolean[numSSTables * NUM_ROWS];
        try (final CompactionStreamScanner scanner = new CompactionStreamScanner(metaData, partitioner, toCompact)) {
            // iterate through CompactionScanner and verify we have all the partition keys we are looking for
            final Rid rid = scanner.getRid();
            while (scanner.hasNext()) {
                scanner.next();
                final int a = rid.getPartitionKey().asIntBuffer().get();
                found[a] = true;
                // extract clustering key value and column name
                final ByteBuffer colBuf = rid.getColumnName();
                final ByteBuffer clusteringKey = ByteBufUtils.readBytesWithShortLength(colBuf);
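                // consume the end-of-component byte that follows each component of the composite name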
                colBuf.get();
                final String colName = ByteBufUtils.string(ByteBufUtils.readBytesWithShortLength(colBuf));
                colBuf.get();
                if (StringUtils.isEmpty(colName)) {
                    continue;
                }
                assertEquals("c", colName);
                final int b = clusteringKey.asIntBuffer().get();
                // extract value column
                final int c = rid.getValue().asIntBuffer().get();
                assertEquals(c, a + b);
                rowCount++;
            }
        }
        assertEquals(numSSTables * NUM_ROWS * NUM_COLS, rowCount);
        for (final boolean b : found) {
            assertTrue(b);
        }
    });
}
Also used : TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) Arrays(java.util.Arrays) StringUtils(org.apache.commons.lang.StringUtils) BufferedInputStream(java.io.BufferedInputStream) UnfilteredRowIterator(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.UnfilteredRowIterator) CustomFilter(org.apache.cassandra.spark.sparksql.filters.CustomFilter) LoggerFactory(org.slf4j.LoggerFactory) AbstractRow(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.AbstractRow) SparkRowIterator(org.apache.cassandra.spark.sparksql.SparkRowIterator) ByteBuffer(java.nio.ByteBuffer) DecoratedKey(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.DecoratedKey) PartitionKeyFilter(org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter) Pair(org.apache.commons.lang3.tuple.Pair) ByteBufUtils(org.apache.cassandra.spark.utils.ByteBufUtils) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) TestUtils(org.apache.cassandra.spark.TestUtils) Map(java.util.Map) NUM_ROWS(org.apache.cassandra.spark.TestUtils.NUM_ROWS) BigInteger(java.math.BigInteger) Path(java.nio.file.Path) SSTablesSupplier(org.apache.cassandra.spark.data.SSTablesSupplier) ImmutableMap(com.google.common.collect.ImmutableMap) Range(com.google.common.collect.Range) Set(java.util.Set) Collectors(java.util.stream.Collectors) List(java.util.List) Stream(java.util.stream.Stream) Assert.assertFalse(org.junit.Assert.assertFalse) TestUtils.getFileType(org.apache.cassandra.spark.TestUtils.getFileType) Optional(java.util.Optional) SparkSSTableReader(org.apache.cassandra.spark.reader.SparkSSTableReader) NotNull(org.jetbrains.annotations.NotNull) Rid(org.apache.cassandra.spark.reader.Rid) Cell(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.Cell) DataInputStream(java.io.DataInputStream) InternalRow(org.apache.spark.sql.catalyst.InternalRow) ReplicationFactor(org.apache.cassandra.spark.data.ReplicationFactor) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ISSTableScanner(org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner) Unfiltered(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.Unfiltered) TestDataLayer(org.apache.cassandra.spark.TestDataLayer) Function(java.util.function.Function) Int32Type(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.marshal.Int32Type) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) TestUtils.runTest(org.apache.cassandra.spark.TestUtils.runTest) Stats(org.apache.cassandra.spark.stats.Stats) ColumnData(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.ColumnData) Descriptor(org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.Descriptor) BufferDecoratedKey(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.BufferDecoratedKey) Murmur3Partitioner(org.apache.cassandra.spark.shaded.fourzero.cassandra.dht.Murmur3Partitioner) SparkRangeFilter(org.apache.cassandra.spark.sparksql.filters.SparkRangeFilter) TestUtils.countSSTables(org.apache.cassandra.spark.TestUtils.countSSTables) Logger(org.slf4j.Logger) Files(java.nio.file.Files) Assert.assertNotNull(org.junit.Assert.assertNotNull) Assert.assertTrue(org.junit.Assert.assertTrue) IOException(java.io.IOException) Test(org.junit.Test) FileInputStream(java.io.FileInputStream) File(java.io.File) Assert.assertNotEquals(org.junit.Assert.assertNotEquals) DataInputPlus(org.apache.cassandra.spark.shaded.fourzero.cassandra.io.util.DataInputPlus) DataLayer(org.apache.cassandra.spark.data.DataLayer) Partitioner(org.apache.cassandra.spark.data.partitioner.Partitioner) Paths(java.nio.file.Paths) NUM_COLS(org.apache.cassandra.spark.TestUtils.NUM_COLS) TestUtils.getFirstFileType(org.apache.cassandra.spark.TestUtils.getFirstFileType) Collections(java.util.Collections) TestSchema(org.apache.cassandra.spark.TestSchema) Assert.assertEquals(org.junit.Assert.assertEquals) InputStream(java.io.InputStream)
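
The Stats hook is the piece of DataLayer plumbing this test leans on: Stats appears to be a no-op base class (compare Stats.DoNothingStats.INSTANCE in Example 1), so the test overrides only the callback it cares about. A minimal sketch of the pattern, assuming the reader open call accepts a Stats instance as it does above:

// hedged sketch: count SSTables skipped because they are already repaired
final AtomicInteger skipCount = new AtomicInteger(0);
final Stats stats = new Stats() {
    @Override
    public void skippedRepairedSSTable(final DataLayer.SSTable ssTable, final long repairedAt) {
        skipCount.incrementAndGet();
    }
};
// pass 'stats' when opening each reader (see openIncrementalReader above), then assert that a
// non-primary repair replica skipped exactly the repaired SSTables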

Example 3 with DataLayer

Use of org.apache.cassandra.spark.data.DataLayer in project spark-cassandra-bulkreader by jberragan.

From the class SparkRowIteratorTests, the method testRowIterator. The test mocks the DataLayer and its stream scanner, then verifies that SparkRowIterator reproduces the expected rows.

private static void testRowIterator(final CassandraBridge.CassandraVersion version, final TestSchema schema, final TestSchema.TestRow[] testRows) throws IOException {
    final CassandraBridge bridge = CassandraBridge.get(version);
    final CqlSchema cqlSchema = schema.buildSchema();
    final int numRows = testRows.length;
    final int numColumns = cqlSchema.fields().size() - cqlSchema.numPartitionKeys() - cqlSchema.numClusteringKeys();
    final List<CqlField> columns = cqlSchema.fields().stream().filter(f -> !f.isPartitionKey()).filter(f -> !f.isClusteringColumn()).sorted().collect(Collectors.toList());
    final Rid rid = new Rid();
    final AtomicInteger rowPos = new AtomicInteger();
    final AtomicInteger colPos = new AtomicInteger();
    // mock data layer
    final DataLayer dataLayer = mock(DataLayer.class);
    when(dataLayer.cqlSchema()).thenReturn(cqlSchema);
    when(dataLayer.version()).thenReturn(version);
    when(dataLayer.isInPartition(any(BigInteger.class), any(ByteBuffer.class))).thenReturn(true);
    when(dataLayer.bridge()).thenCallRealMethod();
    when(dataLayer.stats()).thenReturn(Stats.DoNothingStats.INSTANCE);
    when(dataLayer.requestedFeatures()).thenCallRealMethod();
    // mock scanner
    final IStreamScanner scanner = mock(IStreamScanner.class);
    when(scanner.hasNext()).thenAnswer(invocation -> rowPos.get() < numRows);
    when(scanner.getRid()).thenReturn(rid);
    doAnswer(invocation -> {
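        // simulate the scanner: each next() populates the Rid with one column of the current row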
        final int col = colPos.getAndIncrement();
        final TestSchema.TestRow testRow = testRows[rowPos.get()];
        // write next partition key
        if (col == 0) {
            if (cqlSchema.numPartitionKeys() == 1) {
                final CqlField partitionKey = cqlSchema.partitionKeys().get(0);
                rid.setPartitionKeyCopy(partitionKey.serialize(testRow.get(partitionKey.pos())), BigInteger.ONE);
            } else {
                assert cqlSchema.numPartitionKeys() > 1;
                final ByteBuffer[] partitionBuffers = new ByteBuffer[cqlSchema.numPartitionKeys()];
                int pos = 0;
                for (final CqlField partitionKey : cqlSchema.partitionKeys()) {
                    partitionBuffers[pos] = partitionKey.serialize(testRow.get(partitionKey.pos()));
                    pos++;
                }
                rid.setPartitionKeyCopy(ColumnTypes.build(false, partitionBuffers), BigInteger.ONE);
            }
        }
        // write next clustering keys & column name
        final CqlField column = columns.get(col);
        final ByteBuffer[] colBuffers = new ByteBuffer[cqlSchema.numClusteringKeys() + 1];
        int pos = 0;
        for (final CqlField clusteringColumn : cqlSchema.clusteringKeys()) {
            colBuffers[pos] = clusteringColumn.serialize(testRow.get(clusteringColumn.pos()));
            pos++;
        }
        colBuffers[pos] = bridge.ascii().serialize(column.name());
        rid.setColumnNameCopy(ColumnTypes.build(false, colBuffers));
        // write the value column
        rid.setValueCopy(column.serialize(testRow.get(column.pos())));
        // move to next row
        if (colPos.get() == numColumns) {
            if (rowPos.getAndIncrement() >= numRows) {
                throw new IllegalStateException("Went too far...");
            }
            // reset column position
            colPos.set(0);
        }
        return null;
    }).when(scanner).next();
    when(dataLayer.openCompactionScanner(anyList(), any())).thenReturn(scanner);
    // use SparkRowIterator and verify values match expected
    final SparkRowIterator it = new SparkRowIterator(dataLayer);
    int rowCount = 0;
    while (it.next()) {
        while (rowCount < testRows.length && testRows[rowCount].isTombstone()) // skip tombstones
        {
            rowCount++;
        }
        if (rowCount >= testRows.length) {
            break;
        }
        final TestSchema.TestRow row = testRows[rowCount];
        assertEquals(row, schema.toTestRow(it.get()));
        rowCount++;
    }
    assertEquals(numRows, rowCount);
    it.close();
}
Also used : Rid(org.apache.cassandra.spark.reader.Rid) CassandraBridge(org.apache.cassandra.spark.reader.CassandraBridge) ColumnTypes(org.apache.cassandra.spark.utils.ColumnTypes) VersionRunner(org.apache.cassandra.spark.data.VersionRunner) IStreamScanner(org.apache.cassandra.spark.reader.IStreamScanner) ByteBuffer(java.nio.ByteBuffer) CqlField(org.apache.cassandra.spark.data.CqlField) Stats(org.apache.cassandra.spark.stats.Stats) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) TestUtils(org.apache.cassandra.spark.TestUtils) Mockito.doAnswer(org.mockito.Mockito.doAnswer) BigInteger(java.math.BigInteger) IOException(java.io.IOException) Test(org.junit.Test) CqlSchema(org.apache.cassandra.spark.data.CqlSchema) Mockito.when(org.mockito.Mockito.when) QuickTheory.qt(org.quicktheories.QuickTheory.qt) Collectors(java.util.stream.Collectors) Matchers.any(org.mockito.Matchers.any) List(java.util.List) DataLayer(org.apache.cassandra.spark.data.DataLayer) Matchers.anyList(org.mockito.Matchers.anyList) TestSchema(org.apache.cassandra.spark.TestSchema) Assert.assertEquals(org.junit.Assert.assertEquals) Mockito.mock(org.mockito.Mockito.mock)
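
A trimmed sketch of the stubbing this test relies on, for readers who want the mocking recipe without the scanner plumbing. It assumes Mockito's static imports as listed above; thenCallRealMethod() presumably works for bridge() and requestedFeatures() because they have concrete implementations on DataLayer, an inference from the test rather than a documented guarantee.

// hedged sketch: stub just enough of DataLayer for SparkRowIterator to run
final DataLayer dataLayer = mock(DataLayer.class);
when(dataLayer.cqlSchema()).thenReturn(cqlSchema);
when(dataLayer.version()).thenReturn(version);
when(dataLayer.isInPartition(any(BigInteger.class), any(ByteBuffer.class))).thenReturn(true);
when(dataLayer.stats()).thenReturn(Stats.DoNothingStats.INSTANCE); // no-op stats
when(dataLayer.bridge()).thenCallRealMethod(); // assumed concrete on DataLayer; a plain mock would return null
when(dataLayer.requestedFeatures()).thenCallRealMethod();
when(dataLayer.openCompactionScanner(anyList(), any())).thenReturn(scanner); // 'scanner' is the mocked IStreamScanner
final SparkRowIterator it = new SparkRowIterator(dataLayer); // then iterate and compare rows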

Example 4 with DataLayer

Use of org.apache.cassandra.spark.data.DataLayer in project spark-cassandra-bulkreader by jberragan.

From the class SingleReplicaTests, the method runTest. The helper mocks a PartitionedDataLayer serving three SSTables, optionally marks one of them as missing file components, and opens the replica's readers.

private static void runTest(final boolean shouldThrowIOException, final SSTablesSupplier.ReaderOpener<Reader> readerOpener, final Range<BigInteger> range, final DataLayer.FileType... missingFileTypes) throws InterruptedException, IOException, ExecutionException {
    final PartitionedDataLayer dataLayer = mock(PartitionedDataLayer.class);
    final CassandraInstance instance = new CassandraInstance("-9223372036854775808", "local1-i1", "DC1");
    final DataLayer.SSTable ssTable1 = mockSSTable();
    final DataLayer.SSTable ssTable2 = mockSSTable();
    final DataLayer.SSTable ssTable3 = mockSSTable();
    for (final DataLayer.FileType fileType : missingFileTypes) {
        // isMissing(fileType) returning true should make verify() throw IncompleteSSTableException (e.g. when the Statistics.db file is missing)
        when(ssTable3.isMissing(eq(fileType))).thenReturn(true);
    }
    final Stream<DataLayer.SSTable> sstables = Stream.of(ssTable1, ssTable2, ssTable3);
    when(dataLayer.listInstance(eq(0), eq(range), eq(instance))).thenReturn(CompletableFuture.completedFuture(sstables));
    final SingleReplica replica = new SingleReplica(instance, dataLayer, range, 0, EXECUTOR, true);
    final Set<Reader> readers;
    try {
        readers = replica.openReplicaAsync(readerOpener).get();
    } catch (final ExecutionException e) {
        // extract IOException and rethrow if wrapped in SSTableStreamException
        final IOException io = SSTableStreamException.getIOException(e);
        if (io != null) {
            throw io;
        }
        throw e;
    }
    if (shouldThrowIOException) {
        fail("Should throw IOException because an SSTable is corrupt");
    }
    assertEquals(3, readers.size());
}
Also used : PartitionedDataLayer(org.apache.cassandra.spark.data.PartitionedDataLayer) DataLayer(org.apache.cassandra.spark.data.DataLayer) SparkSSTableReader(org.apache.cassandra.spark.reader.SparkSSTableReader) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException)
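
The failure injection above hinges on DataLayer.SSTable.isMissing(FileType). The test's mockSSTable() helper is not shown, so the sketch below assumes it wraps a plain Mockito mock; the FileType constant is taken from the ones this page actually uses (SUMMARY, DATA).

// hedged sketch: simulate an SSTable with a missing component file
final DataLayer.SSTable ssTable = mock(DataLayer.SSTable.class);
when(ssTable.isMissing(eq(DataLayer.FileType.DATA))).thenReturn(true);
// when the replica verifies this sstable while opening readers, an IncompleteSSTableException
// should surface, wrapped via SSTableStreamException inside the ExecutionException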

Example 5 with DataLayer

Use of org.apache.cassandra.spark.data.DataLayer in project spark-cassandra-bulkreader by jberragan.

From the class IndexOffsetTests, the method test. The test writes 500,000 rows, splits the token range across 128 simulated Spark partitions, and verifies that each key is read exactly once across all partitions.

private static void test(final CassandraBridge bridge, final Path dir, final Partitioner partitioner, final boolean enableCompression) throws IOException {
    final int numKeys = 500000;
    final int sparkPartitions = 128;
    final TestSchema schema = TestSchema.basicBuilder(bridge).withCompression(enableCompression).build();
    TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
        for (int i = 0; i < numKeys; i++) {
            writer.write(i, 0, i);
        }
    });
    assertEquals(1, countSSTables(dir));
    final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
    if (metadata == null) {
        throw new NullPointerException("Could not find table");
    }
    final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, schema.keyspace, schema.createStmt, false, Collections.emptySet(), true, null, dir.toString());
    final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));
    final Integer[] counts = IntStream.range(0, numKeys).map(i -> 0).boxed().toArray(Integer[]::new);
    final CassandraRing ring = TestUtils.createRing(partitioner, 32);
    // use TokenPartitioner to simulate the Spark workers' token range partitions
    final TokenPartitioner tokenPartitioner = new TokenPartitioner(ring, 1, sparkPartitions);
    final List<Range<BigInteger>> ranges = tokenPartitioner.subRanges();
    LOGGER.info("Testing index offsets numKeys={} sparkPartitions={} partitioner={} enableCompression={}", numKeys, ranges.size(), partitioner.name(), enableCompression);
    final MutableInt skipped = new MutableInt(0);
    for (Range<BigInteger> range : ranges) {
        final FourZeroSSTableReader reader = FourZeroSSTableReader.builder(metadata, ssTable).withFilters(Collections.singletonList(SparkRangeFilter.create(range))).withStats(new Stats() {

            public void skippedPartition(ByteBuffer key, BigInteger token) {
                skipped.add(1);
            }
        }).build();
        if (reader.ignore()) {
            // we can skip this range entirely; it doesn't overlap with the sstable's token range
            continue;
        }
        // each scanner should only read tokens within its own token range
        try (final ISSTableScanner scanner = reader.getScanner()) {
            while (scanner.hasNext()) {
                final UnfilteredRowIterator rowIterator = scanner.next();
                final int key = rowIterator.partitionKey().getKey().getInt();
                // count how many times we read a key across all 'spark' token partitions
                counts[key] = counts[key] + 1;
                while (rowIterator.hasNext()) {
                    rowIterator.next();
                }
            }
        }
    }
    // verify we read each key exactly once across all Spark partitions
    assertEquals(counts.length, numKeys);
    int idx = 0;
    for (Integer count : counts) {
        if (count == 0) {
            LOGGER.error("Missing key key={} token={} partitioner={}", idx, FourZeroUtils.tokenToBigInteger(FourZero.getPartitioner(partitioner).decorateKey((ByteBuffer) ByteBuffer.allocate(4).putInt(idx).flip()).getToken()), partitioner.name());
        } else if (count > 1) {
            LOGGER.error("Key read by more than 1 Spark partition key={} token={} partitioner={}", idx, FourZeroUtils.tokenToBigInteger(FourZero.getPartitioner(partitioner).decorateKey((ByteBuffer) ByteBuffer.allocate(4).putInt(idx).flip()).getToken()), partitioner.name());
        }
        assertEquals(count == 0 ? "Key not found: " + idx : "Key " + idx + " read " + count + " times", 1, count.intValue());
        idx++;
    }
    LOGGER.info("Success skippedKeys={} partitioner={}", skipped.intValue(), partitioner.name());
}
Also used : TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) ISSTableScanner(org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner) UnfilteredRowIterator(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.UnfilteredRowIterator) CassandraRing(org.apache.cassandra.spark.data.partitioner.CassandraRing) TestSchema(org.apache.cassandra.spark.TestSchema) Range(com.google.common.collect.Range) ByteBuffer(java.nio.ByteBuffer) LocalDataLayer(org.apache.cassandra.spark.data.LocalDataLayer) BigInteger(java.math.BigInteger) DataLayer(org.apache.cassandra.spark.data.DataLayer) MutableInt(org.apache.commons.lang.mutable.MutableInt) Stats(org.apache.cassandra.spark.stats.Stats) TokenPartitioner(org.apache.cassandra.spark.data.partitioner.TokenPartitioner)
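
Distilled from the example above, the per-Spark-partition read pattern: build a FourZeroSSTableReader with a SparkRangeFilter for the partition's token range, skip the sstable entirely when the range misses it, and otherwise scan. Names mirror the example; the withStats(...) call is optional and omitted in this sketch.

// hedged sketch: read only the keys whose tokens fall inside this Spark partition's range
final FourZeroSSTableReader reader = FourZeroSSTableReader.builder(metadata, ssTable)
                                                          .withFilters(Collections.singletonList(SparkRangeFilter.create(range)))
                                                          .build();
if (!reader.ignore()) { // ignore() is true when the range doesn't overlap the sstable
    try (final ISSTableScanner scanner = reader.getScanner()) {
        while (scanner.hasNext()) {
            final UnfilteredRowIterator rowIterator = scanner.next();
            while (rowIterator.hasNext()) {
                rowIterator.next(); // consume rows; out-of-range partitions never appear
            }
        }
    }
}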

Aggregations

DataLayer (org.apache.cassandra.spark.data.DataLayer) 8
TestSchema (org.apache.cassandra.spark.TestSchema) 7
BigInteger (java.math.BigInteger) 6
ByteBuffer (java.nio.ByteBuffer) 6
TableMetadata (org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) 6
Test (org.junit.Test) 6
Path (java.nio.file.Path) 5
TestUtils.runTest (org.apache.cassandra.spark.TestUtils.runTest) 5
IOException (java.io.IOException) 4
ArrayList (java.util.ArrayList) 4
List (java.util.List) 4
Collectors (java.util.stream.Collectors) 4
TestUtils (org.apache.cassandra.spark.TestUtils) 4
ImmutableMap (com.google.common.collect.ImmutableMap) 3
Range (com.google.common.collect.Range) 3
DataInputStream (java.io.DataInputStream) 3
File (java.io.File) 3
Map (java.util.Map) 3
AtomicInteger (java.util.concurrent.atomic.AtomicInteger) 3
TestDataLayer (org.apache.cassandra.spark.TestDataLayer) 3