
Example 1 with ISSTableScanner

Use of org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner in project spark-cassandra-bulkreader by jberragan.

From class IndexOffsetTests, method test:

private static void test(final CassandraBridge bridge, final Path dir, final Partitioner partitioner, final boolean enableCompression) throws IOException {
    final int numKeys = 500000;
    final int sparkPartitions = 128;
    final TestSchema schema = TestSchema.basicBuilder(bridge).withCompression(enableCompression).build();
    TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
        for (int i = 0; i < numKeys; i++) {
            writer.write(i, 0, i);
        }
    });
    assertEquals(1, countSSTables(dir));
    final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
    if (metadata == null) {
        throw new NullPointerException("Could not find table");
    }
    final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, schema.keyspace, schema.createStmt, false, Collections.emptySet(), true, null, dir.toString());
    final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));
    // zero-initialised per-key read counters; counts[i] tracks how many Spark partitions read key i
    final Integer[] counts = IntStream.range(0, numKeys).map(i -> 0).boxed().toArray(Integer[]::new);
    final CassandraRing ring = TestUtils.createRing(partitioner, 32);
    // use TokenPartitioner to simulate how Spark workers split the token range into partitions
    final TokenPartitioner tokenPartitioner = new TokenPartitioner(ring, 1, sparkPartitions);
    final List<Range<BigInteger>> ranges = tokenPartitioner.subRanges();
    LOGGER.info("Testing index offsets numKeys={} sparkPartitions={} partitioner={} enableCompression={}", numKeys, ranges.size(), partitioner.name(), enableCompression);
    final MutableInt skipped = new MutableInt(0);
    for (Range<BigInteger> range : ranges) {
        final FourZeroSSTableReader reader = FourZeroSSTableReader.builder(metadata, ssTable)
                .withFilters(Collections.singletonList(SparkRangeFilter.create(range)))
                .withStats(new Stats() {
                    @Override
                    public void skippedPartition(ByteBuffer key, BigInteger token) {
                        skipped.add(1);
                    }
                })
                .build();
        if (reader.ignore()) {
            // we can skip this range entirely; it doesn't overlap with the sstable
            continue;
        }
        // each scanner should only read tokens within its own token range
        try (final ISSTableScanner scanner = reader.getScanner()) {
            while (scanner.hasNext()) {
                final UnfilteredRowIterator rowIterator = scanner.next();
                final int key = rowIterator.partitionKey().getKey().getInt();
                // count how many times we read a key across all 'spark' token partitions
                counts[key] = counts[key] + 1;
                while (rowIterator.hasNext()) {
                    rowIterator.next();
                }
            }
        }
    }
    // verify we read each key exactly once across all Spark partitions
    assertEquals(counts.length, numKeys);
    int idx = 0;
    for (Integer count : counts) {
        if (count != 1) {
            final BigInteger token = FourZeroUtils.tokenToBigInteger(FourZero.getPartitioner(partitioner).decorateKey((ByteBuffer) ByteBuffer.allocate(4).putInt(idx).flip()).getToken());
            if (count == 0) {
                LOGGER.error("Missing key key={} token={} partitioner={}", idx, token, partitioner.name());
            } else {
                LOGGER.error("Key read by more than 1 Spark partition key={} token={} partitioner={}", idx, token, partitioner.name());
            }
        }
        assertEquals(count == 0 ? "Key not found: " + idx : "Key " + idx + " read " + count + " times", 1, count.intValue());
        idx++;
    }
    LOGGER.info("Success skippedKeys={} partitioner={}", skipped.intValue(), partitioner.name());
}
Also used: TableMetadata(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata) ISSTableScanner(org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner) UnfilteredRowIterator(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.UnfilteredRowIterator) CassandraRing(org.apache.cassandra.spark.data.partitioner.CassandraRing) TestSchema(org.apache.cassandra.spark.TestSchema) Range(com.google.common.collect.Range) ByteBuffer(java.nio.ByteBuffer) LocalDataLayer(org.apache.cassandra.spark.data.LocalDataLayer) BigInteger(java.math.BigInteger) DataLayer(org.apache.cassandra.spark.data.DataLayer) MutableInt(org.apache.commons.lang.mutable.MutableInt) Stats(org.apache.cassandra.spark.stats.Stats) TokenPartitioner(org.apache.cassandra.spark.data.partitioner.TokenPartitioner)
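
For context, a minimal sketch of how a JUnit test might drive this helper across both partitioners, with and without compression. Only test(...) comes from the example above; the CassandraBridge.get(...) lookup, the Partitioner enum values, and the temp-directory cleanup are assumptions for illustration (imports elided, as in the snippets above).

@Test
public void testIndexOffsets() throws IOException {
    // assumed lookup of the 4.0 bridge; the real test fixture may wire this differently
    final CassandraBridge bridge = CassandraBridge.get(CassandraBridge.CassandraVersion.FOURZERO);
    for (final Partitioner partitioner : Partitioner.values()) {
        for (final boolean enableCompression : new boolean[]{ false, true }) {
            final Path dir = Files.createTempDirectory("indexoffsets");
            try {
                test(bridge, dir, partitioner, enableCompression);
            } finally {
                // best-effort cleanup of the temporary sstable directory
                Files.walk(dir).sorted(Comparator.reverseOrder()).forEach(p -> p.toFile().delete());
            }
        }
    }
}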

Example 2 with ISSTableScanner

Use of org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner in project spark-cassandra-bulkreader by jberragan.

From class SSTableReaderTests, method countAndValidateRows:

private static int countAndValidateRows(@NotNull final FourZeroSSTableReader reader) {
    int count = 0;
    // close the scanner when finished to release the underlying sstable resources
    try (final ISSTableScanner scanner = reader.getScanner()) {
        while (scanner.hasNext()) {
            final UnfilteredRowIterator it = scanner.next();
            // the partition key 'a' is the same for every row in this partition
            final BufferDecoratedKey key = (BufferDecoratedKey) it.partitionKey();
            final int a = key.getKey().asIntBuffer().get();
            while (it.hasNext()) {
                final Unfiltered unfiltered = it.next();
                assertTrue(unfiltered.isRow());
                final AbstractRow row = (AbstractRow) unfiltered;
                final int b = row.clustering().bufferAt(0).asIntBuffer().get();
                for (final ColumnData data : row) {
                    // each cell value was written as the sum of the partition and clustering keys
                    final Cell cell = (Cell) data;
                    final int c = cell.buffer().getInt();
                    assertEquals(c, a + b);
                    count++;
                }
            }
        }
    }
    return count;
}
Also used: ISSTableScanner(org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner) UnfilteredRowIterator(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.UnfilteredRowIterator) AbstractRow(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.AbstractRow) BufferDecoratedKey(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.BufferDecoratedKey) ColumnData(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.ColumnData) Cell(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.Cell) Unfiltered(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.Unfiltered)
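
A sketch of a caller, reusing the reader construction from Example 1. The schema/dataLayer setup is assumed to match Example 1, and clusteringKeysPerPartition is a hypothetical name for the number of rows written per partition (each row contributes exactly one cell storing a + b):

// hypothetical caller: open the sstable written by the test fixture and verify the total row count
final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst()
        .orElseThrow(() -> new RuntimeException("Could not find sstable"));
final FourZeroSSTableReader reader = FourZeroSSTableReader.builder(metadata, ssTable).build();
// one cell per (partition key, clustering key) pair, so expect the product
assertEquals(numKeys * clusteringKeysPerPartition, countAndValidateRows(reader));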

Example 3 with ISSTableScanner

Use of org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner in project spark-cassandra-bulkreader by jberragan.

From class TestUtils, method sstableToJsonFourZero:

private static void sstableToJsonFourZero(final Path dataDbFile, final OutputStream out) throws FileNotFoundException {
    if (!Files.exists(dataDbFile)) {
        throw new FileNotFoundException("Cannot find file " + dataDbFile.toAbsolutePath());
    }
    if (!Descriptor.isValidFile(dataDbFile.toFile())) {
        throw new RuntimeException("Invalid sstable file");
    }
    final Descriptor desc = Descriptor.fromFilename(dataDbFile.toAbsolutePath().toString());
    try {
        final TableMetadataRef metadata = TableMetadataRef.forOfflineTools(Util.metadataFromSSTable(desc));
        final SSTableReader sstable = SSTableReader.openNoValidation(desc, metadata);
        // close the scanner after the JSON has been written; toJson fully consumes the partition stream before returning
        try (final ISSTableScanner currentScanner = sstable.getScanner()) {
            final Stream<UnfilteredRowIterator> partitions = iterToStream(currentScanner);
            JsonTransformer.toJson(currentScanner, partitions, false, metadata.get(), out);
        }
    } catch (final IOException e) {
        throw new RuntimeException(e);
    }
}
Also used: ISSTableScanner(org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner) UnfilteredRowIterator(org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.UnfilteredRowIterator) SSTableReader(org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.format.SSTableReader) TableMetadataRef(org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadataRef) FileNotFoundException(java.io.FileNotFoundException) Descriptor(org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.Descriptor) IOException(java.io.IOException)
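
The iterToStream helper is referenced above but not shown. A minimal sketch, assuming it mirrors the equivalent helper in Cassandra's own offline sstable tools (ISSTableScanner is an Iterator<UnfilteredRowIterator>):

private static Stream<UnfilteredRowIterator> iterToStream(final ISSTableScanner iter) {
    // wrap the scanner in a lazy sequential stream so JsonTransformer can consume partitions one at a time
    final Spliterator<UnfilteredRowIterator> spliterator = Spliterators.spliteratorUnknownSize(iter, Spliterator.IMMUTABLE);
    return StreamSupport.stream(spliterator, false);
}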

Aggregations

UnfilteredRowIterator (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.UnfilteredRowIterator): 3 examples
ISSTableScanner (org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner): 3 examples
Range (com.google.common.collect.Range): 1 example
FileNotFoundException (java.io.FileNotFoundException): 1 example
IOException (java.io.IOException): 1 example
BigInteger (java.math.BigInteger): 1 example
ByteBuffer (java.nio.ByteBuffer): 1 example
TestSchema (org.apache.cassandra.spark.TestSchema): 1 example
DataLayer (org.apache.cassandra.spark.data.DataLayer): 1 example
LocalDataLayer (org.apache.cassandra.spark.data.LocalDataLayer): 1 example
CassandraRing (org.apache.cassandra.spark.data.partitioner.CassandraRing): 1 example
TokenPartitioner (org.apache.cassandra.spark.data.partitioner.TokenPartitioner): 1 example
BufferDecoratedKey (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.BufferDecoratedKey): 1 example
AbstractRow (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.AbstractRow): 1 example
Cell (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.Cell): 1 example
ColumnData (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.ColumnData): 1 example
Unfiltered (org.apache.cassandra.spark.shaded.fourzero.cassandra.db.rows.Unfiltered): 1 example
Descriptor (org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.Descriptor): 1 example
SSTableReader (org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.format.SSTableReader): 1 example
TableMetadata (org.apache.cassandra.spark.shaded.fourzero.cassandra.schema.TableMetadata): 1 example