Use of org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner in project spark-cassandra-bulkreader by jberragan.
The class IndexOffsetTests, method test.
private static void test(final CassandraBridge bridge, final Path dir, final Partitioner partitioner, final boolean enableCompression) throws IOException {
    final int numKeys = 500000;
    final int sparkPartitions = 128;
    final TestSchema schema = TestSchema.basicBuilder(bridge).withCompression(enableCompression).build();
    TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
        for (int i = 0; i < numKeys; i++) {
            writer.write(i, 0, i);
        }
    });
    assertEquals(1, countSSTables(dir));
    final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
    if (metadata == null) {
        throw new NullPointerException("Could not find table");
    }
    final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, schema.keyspace, schema.createStmt, false, Collections.emptySet(), true, null, dir.toString());
    final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));
    final Integer[] counts = IntStream.range(0, numKeys).map(i -> 0).boxed().toArray(Integer[]::new);
    final CassandraRing ring = TestUtils.createRing(partitioner, 32);
    // use TokenPartitioner to simulate Spark worker token partitions
    final TokenPartitioner tokenPartitioner = new TokenPartitioner(ring, 1, sparkPartitions);
    final List<Range<BigInteger>> ranges = tokenPartitioner.subRanges();
    LOGGER.info("Testing index offsets numKeys={} sparkPartitions={} partitioner={} enableCompression={}", numKeys, ranges.size(), partitioner.name(), enableCompression);
    final MutableInt skipped = new MutableInt(0);
    for (final Range<BigInteger> range : ranges) {
        final FourZeroSSTableReader reader = FourZeroSSTableReader.builder(metadata, ssTable)
                                                                  .withFilters(Collections.singletonList(SparkRangeFilter.create(range)))
                                                                  .withStats(new Stats() {
                                                                      public void skippedPartition(ByteBuffer key, BigInteger token) {
                                                                          skipped.add(1);
                                                                      }
                                                                  })
                                                                  .build();
        if (reader.ignore()) {
            // we can skip this range entirely: it doesn't overlap with the sstable
            continue;
        }
        // each scanner should only read tokens within its own token range
        try (final ISSTableScanner scanner = reader.getScanner()) {
            while (scanner.hasNext()) {
                final UnfilteredRowIterator rowIterator = scanner.next();
                final int key = rowIterator.partitionKey().getKey().getInt();
                // count how many times we read each key across all 'spark' token partitions
                counts[key] = counts[key] + 1;
                while (rowIterator.hasNext()) {
                    rowIterator.next();
                }
            }
        }
    }
    // verify we read each key exactly once across all Spark partitions
    assertEquals(numKeys, counts.length);
    int idx = 0;
    for (final Integer count : counts) {
        if (count == 0) {
            LOGGER.error("Missing key key={} token={} partitioner={}", idx, FourZeroUtils.tokenToBigInteger(FourZero.getPartitioner(partitioner).decorateKey((ByteBuffer) ByteBuffer.allocate(4).putInt(idx).flip()).getToken()), partitioner.name());
        } else if (count > 1) {
            LOGGER.error("Key read by more than 1 Spark partition key={} token={} partitioner={}", idx, FourZeroUtils.tokenToBigInteger(FourZero.getPartitioner(partitioner).decorateKey((ByteBuffer) ByteBuffer.allocate(4).putInt(idx).flip()).getToken()), partitioner.name());
        }
        assertEquals(count == 0 ? "Key not found: " + idx : "Key " + idx + " read " + count + " times", 1, count.intValue());
        idx++;
    }
    LOGGER.info("Success skippedKeys={} partitioner={}", skipped.intValue(), partitioner.name());
}
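The key-to-token expression repeated in the two LOGGER.error calls above is dense enough to merit a named helper. The following is a minimal sketch, not part of the project: the helper name tokenOf is hypothetical, but every call it makes (ByteBuffer.allocate/putInt/flip, FourZero.getPartitioner, decorateKey, getToken, FourZeroUtils.tokenToBigInteger) appears verbatim in the snippet above.

// Hedged sketch: map an int partition key to its ring token. The helper name
// 'tokenOf' is hypothetical; the calls mirror the logging expression above.
private static BigInteger tokenOf(final int key, final Partitioner partitioner) {
    final ByteBuffer keyBuf = (ByteBuffer) ByteBuffer.allocate(4).putInt(key).flip();
    return FourZeroUtils.tokenToBigInteger(FourZero.getPartitioner(partitioner).decorateKey(keyBuf).getToken());
}

With such a helper, each error line reduces to LOGGER.error("Missing key key={} token={} partitioner={}", idx, tokenOf(idx, partitioner), partitioner.name()).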
Use of org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner in project spark-cassandra-bulkreader by jberragan.
The class SSTableReaderTests, method countAndValidateRows.
private static int countAndValidateRows(@NotNull final FourZeroSSTableReader reader) {
    int count = 0;
    // close the scanner when finished so the underlying file handles are released
    try (final ISSTableScanner scanner = reader.getScanner()) {
        while (scanner.hasNext()) {
            final UnfilteredRowIterator it = scanner.next();
            while (it.hasNext()) {
                final BufferDecoratedKey key = (BufferDecoratedKey) it.partitionKey();
                final int a = key.getKey().asIntBuffer().get();
                final Unfiltered unfiltered = it.next();
                assertTrue(unfiltered.isRow());
                final AbstractRow row = (AbstractRow) unfiltered;
                final int b = row.clustering().bufferAt(0).asIntBuffer().get();
                for (final ColumnData data : row) {
                    final Cell cell = (Cell) data;
                    final int c = cell.buffer().getInt();
                    // each cell value is expected to equal partition key + clustering key
                    assertEquals(c, a + b);
                    count++;
                }
            }
        }
    }
    return count;
}
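A call site for this helper might look like the following sketch. It assumes a TableMetadata and a DataLayer.SSTable obtained as in the IndexOffsetTests snippet above, and it assumes the FourZeroSSTableReader builder can be built without withFilters(...)/withStats(...), which this listing only shows with those calls chained; expectedRows is a placeholder for whatever the test wrote.

// Hedged usage sketch: read the whole sstable (no range filter, an assumption
// not shown in this listing) and verify the expected number of rows.
final FourZeroSSTableReader reader = FourZeroSSTableReader.builder(metadata, ssTable).build();
assertEquals(expectedRows, countAndValidateRows(reader)); // 'expectedRows' is a placeholder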
Use of org.apache.cassandra.spark.shaded.fourzero.cassandra.io.sstable.ISSTableScanner in project spark-cassandra-bulkreader by jberragan.
The class TestUtils, method sstableToJsonFourZero.
private static void sstableToJsonFourZero(final Path dataDbFile, final OutputStream out) throws FileNotFoundException {
    if (!Files.exists(dataDbFile)) {
        throw new FileNotFoundException("Cannot find file " + dataDbFile.toAbsolutePath());
    }
    if (!Descriptor.isValidFile(dataDbFile.toFile())) {
        throw new RuntimeException("Invalid sstable file");
    }
    final Descriptor desc = Descriptor.fromFilename(dataDbFile.toAbsolutePath().toString());
    try {
        // open the sstable offline, reading the schema from the sstable itself
        final TableMetadataRef metadata = TableMetadataRef.forOfflineTools(Util.metadataFromSSTable(desc));
        final SSTableReader sstable = SSTableReader.openNoValidation(desc, metadata);
        final ISSTableScanner currentScanner = sstable.getScanner();
        final Stream<UnfilteredRowIterator> partitions = iterToStream(currentScanner);
        // stream every partition through the JSON transformer into the output stream
        JsonTransformer.toJson(currentScanner, partitions, false, metadata.get(), out);
    } catch (final IOException e) {
        throw new RuntimeException(e);
    }
}
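For context, invoking this helper is straightforward. The sketch below uses an illustrative Data.db path (the nb-1-big-Data.db name follows the Cassandra 4.0 filename convention, but the path itself is made up); since the method is private, a call like this would live inside TestUtils or behind a public wrapper.

// Hedged usage sketch: dump an sstable's contents as JSON to stdout.
// The path is illustrative only; sstableToJsonFourZero is private to TestUtils.
final Path dataDbFile = Paths.get("/tmp/data/ks/tbl/nb-1-big-Data.db");
sstableToJsonFourZero(dataDbFile, System.out);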