use of org.apache.cassandra.spark.data.LocalDataLayer in project spark-cassandra-bulkreader by jberragan.
the class IndexDbTests method testSearchIndex.
@Test
public void testSearchIndex() {
    runTest((partitioner, dir, bridge) -> {
        final TestSchema schema = TestSchema.basicBuilder(bridge).withCompression(false).build();
        final IPartitioner iPartitioner = FourZero.getPartitioner(partitioner);
        final int numRows = 5000;

        // write an sstable and record the token for each row
        final List<BigInteger> tokens = new ArrayList<>(numRows);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < numRows; i++) {
                final ByteBuffer key = (ByteBuffer) ByteBuffer.allocate(4).putInt(i).flip();
                final BigInteger token = FourZeroUtils.tokenToBigInteger(iPartitioner.decorateKey(key).getToken());
                tokens.add(token);
                writer.write(i, 0, i);
            }
        });
        assertEquals(1, countSSTables(dir));
        Collections.sort(tokens);

        final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
        if (metadata == null) {
            throw new NullPointerException("Could not find table");
        }
        final Path summaryDb = TestUtils.getFirstFileType(dir, DataLayer.FileType.SUMMARY);
        assertNotNull(summaryDb);
        final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, schema.keyspace, schema.createStmt, false, Collections.emptySet(), true, null, dir.toString());
        final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));

        final int rowSize = 39;
        final int sample = 4;
        // Sample the token list, read the offset in Index.db for each sampled token and verify it matches the expected offset.
        // We sample the list because IndexDbUtils.findStartOffset(...) returns the previous offset, so we want to verify
        // that it correctly skips tokens less than the one we are searching for before returning.
        final List<BigInteger> sparseList = IntStream.range(0, tokens.size()).filter(i -> i > 0 && i % sample == 0).mapToObj(tokens::get).collect(Collectors.toList());
        assertEquals((numRows / 4) - 1, sparseList.size());
        try (final DataInputStream in = new DataInputStream(Objects.requireNonNull(ssTable.openPrimaryIndexStream()))) {
            try {
                for (int idx = 0; idx < sparseList.size(); idx++) {
                    final BigInteger token = sparseList.get(idx);
                    final long expectedOffset = (((idx + 1L) * sample) - 1) * rowSize;
                    final long offset = IndexDbUtils.findStartOffset(in, iPartitioner, Range.closed(token, token), Stats.DoNothingStats.INSTANCE);
                    assertEquals(expectedOffset, offset);
                    FourZeroUtils.skipRowIndexEntry(in);
                }
            } catch (final EOFException ignore) {
            }
        }
    });
}
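To make the expected-offset arithmetic concrete: with sample = 4, the sampled token at index idx in sparseList is tokens.get((idx + 1) * 4), and findStartOffset(...) is expected to return the offset of the Index.db entry immediately before it, each entry occupying rowSize = 39 bytes. A worked example for the first sampled token:

// idx = 0 corresponds to tokens.get(4); the preceding Index.db entry is row index 3,
// so the expected start offset is 3 * rowSize = 3 * 39 = 117 bytes into Index.db.
final long expectedOffsetForFirstSample = (((0 + 1L) * 4) - 1) * 39; // = 117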
use of org.apache.cassandra.spark.data.LocalDataLayer in project spark-cassandra-bulkreader by jberragan.
the class IndexOffsetTests method test.
private static void test(final CassandraBridge bridge, final Path dir, final Partitioner partitioner, final boolean enableCompression) throws IOException {
    final int numKeys = 500000;
    final int sparkPartitions = 128;
    final TestSchema schema = TestSchema.basicBuilder(bridge).withCompression(enableCompression).build();
    TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
        for (int i = 0; i < numKeys; i++) {
            writer.write(i, 0, i);
        }
    });
    assertEquals(1, countSSTables(dir));

    final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
    if (metadata == null) {
        throw new NullPointerException("Could not find table");
    }
    final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, schema.keyspace, schema.createStmt, false, Collections.emptySet(), true, null, dir.toString());
    final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));

    final Integer[] counts = IntStream.range(0, numKeys).map(i -> 0).boxed().toArray(Integer[]::new);
    final CassandraRing ring = TestUtils.createRing(partitioner, 32);
    // use a TokenPartitioner to simulate how Spark workers split the token ring into partitions
    final TokenPartitioner tokenPartitioner = new TokenPartitioner(ring, 1, sparkPartitions);
    final List<Range<BigInteger>> ranges = tokenPartitioner.subRanges();
    LOGGER.info("Testing index offsets numKeys={} sparkPartitions={} partitioner={} enableCompression={}", numKeys, ranges.size(), partitioner.name(), enableCompression);

    final MutableInt skipped = new MutableInt(0);
    for (Range<BigInteger> range : ranges) {
        final FourZeroSSTableReader reader = FourZeroSSTableReader.builder(metadata, ssTable)
                .withFilters(Collections.singletonList(SparkRangeFilter.create(range)))
                .withStats(new Stats() {
                    public void skippedPartition(ByteBuffer key, BigInteger token) {
                        skipped.add(1);
                    }
                })
                .build();
        if (reader.ignore()) {
            // we can skip this range entirely; it doesn't overlap with the sstable
            continue;
        }
        // each scanner should only read tokens within its own token range
        try (final ISSTableScanner scanner = reader.getScanner()) {
            while (scanner.hasNext()) {
                final UnfilteredRowIterator rowIterator = scanner.next();
                final int key = rowIterator.partitionKey().getKey().getInt();
                // count how many times we read each key across all 'spark' token partitions
                counts[key] = counts[key] + 1;
                while (rowIterator.hasNext()) {
                    rowIterator.next();
                }
            }
        }
    }

    // verify we read each key exactly once across all Spark partitions
    assertEquals(numKeys, counts.length);
    int idx = 0;
    for (Integer count : counts) {
        if (count == 0) {
            LOGGER.error("Missing key key={} token={} partitioner={}", idx, FourZeroUtils.tokenToBigInteger(FourZero.getPartitioner(partitioner).decorateKey((ByteBuffer) ByteBuffer.allocate(4).putInt(idx).flip()).getToken()), partitioner.name());
        } else if (count > 1) {
            LOGGER.error("Key read by more than 1 Spark partition key={} token={} partitioner={}", idx, FourZeroUtils.tokenToBigInteger(FourZero.getPartitioner(partitioner).decorateKey((ByteBuffer) ByteBuffer.allocate(4).putInt(idx).flip()).getToken()), partitioner.name());
        }
        assertEquals(count == 0 ? "Key not found: " + idx : "Key " + idx + " read " + count + " times", 1, count.intValue());
        idx++;
    }
    LOGGER.info("Success skippedKeys={} partitioner={}", skipped.intValue(), partitioner.name());
}
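The method above is a private helper; its caller is not shown on this page. A minimal sketch of how it might be driven, assuming the test class exposes the same runTest(partitioner, dir, bridge) harness used by the other tests here and that the harness tolerates the checked IOException:

@Test
public void testReadIndexOffsets() {
    runTest((partitioner, dir, bridge) -> test(bridge, dir, partitioner, true));
}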
use of org.apache.cassandra.spark.data.LocalDataLayer in project spark-cassandra-bulkreader by jberragan.
the class SummaryDbTests method testSearchSummary.
@Test
public void testSearchSummary() {
    runTest((partitioner, dir, bridge) -> {
        final TestSchema schema = TestSchema.basicBuilder(bridge).withCompression(false).build();
        final IPartitioner iPartitioner = FourZero.getPartitioner(partitioner);
        final int numRows = 1000;

        // write an sstable and record the token for each row
        final List<BigInteger> tokens = new ArrayList<>(numRows);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < numRows; i++) {
                final ByteBuffer key = (ByteBuffer) ByteBuffer.allocate(4).putInt(i).flip();
                final BigInteger token = FourZeroUtils.tokenToBigInteger(iPartitioner.decorateKey(key).getToken());
                tokens.add(token);
                writer.write(i, 0, i);
            }
        });
        assertEquals(1, countSSTables(dir));
        Collections.sort(tokens);

        final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
        if (metadata == null) {
            throw new NullPointerException("Could not find table");
        }
        final Path summaryDb = TestUtils.getFirstFileType(dir, DataLayer.FileType.SUMMARY);
        assertNotNull(summaryDb);
        final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, schema.keyspace, schema.createStmt, false, Collections.emptySet(), true, null, dir.toString());
        final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));

        // binary-search the Summary.db file in token order and verify the returned index offsets are non-decreasing
        final SummaryDbUtils.Summary summary = SummaryDbUtils.readSummary(metadata, ssTable);
        long prev = -1;
        for (final BigInteger token : tokens) {
            final long offset = SummaryDbUtils.findIndexOffsetInSummary(summary.summary(), iPartitioner, token);
            if (prev == -1) {
                assertEquals(0, offset);
            } else {
                assertTrue(prev <= offset);
            }
            prev = offset;
        }
    });
}
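The core of a Summary.db lookup is finding the greatest sampled token less than or equal to the search token and returning its recorded Index.db offset, which is why the offsets above must be non-decreasing when queried in token order. The standalone sketch below illustrates that idea on plain arrays; it is a hypothetical helper for intuition, not the project's SummaryDbUtils implementation:

// sampledTokens must be sorted ascending; indexOffsets[i] is the Index.db offset recorded
// for sampledTokens[i]. Returns the offset of the greatest sampled token <= searchToken,
// falling back to the first entry when searchToken precedes all samples.
static long findOffset(final BigInteger[] sampledTokens, final long[] indexOffsets, final BigInteger searchToken) {
    int low = 0;
    int high = sampledTokens.length - 1;
    int result = 0;
    while (low <= high) {
        final int mid = (low + high) >>> 1;
        if (sampledTokens[mid].compareTo(searchToken) <= 0) {
            result = mid; // candidate: greatest sampled token <= searchToken so far
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }
    return indexOffsets[result];
}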
use of org.apache.cassandra.spark.data.LocalDataLayer in project spark-cassandra-bulkreader by jberragan.
the class KryoSerializationTests method testLocalDataLayerThreeZero.
@Test
public void testLocalDataLayerThreeZero() {
    final String path1 = UUID.randomUUID().toString(), path2 = UUID.randomUUID().toString(), path3 = UUID.randomUUID().toString();
    final LocalDataLayer localDataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.THREEZERO, "test_keyspace", "create table test_keyspace.test_table (a int, b int, c int, primary key(a, b));", path1, path2, path3);
    final Output out = KryoSerializationTests.serialize(localDataLayer);
    final LocalDataLayer deserialized = KryoSerializationTests.deserialize(out, LocalDataLayer.class);
    assertNotNull(deserialized);
    assertEquals(localDataLayer.version(), deserialized.version());
    assertEquals(localDataLayer, deserialized);
}
use of org.apache.cassandra.spark.data.LocalDataLayer in project spark-cassandra-bulkreader by jberragan.
the class KryoSerializationTests method testLocalDataLayerFourZero.
@Test
public void testLocalDataLayerFourZero() {
    final String path1 = UUID.randomUUID().toString(), path2 = UUID.randomUUID().toString(), path3 = UUID.randomUUID().toString();
    final LocalDataLayer localDataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, "test_keyspace", "create table test_keyspace.test_table (a int, b int, c int, primary key(a, b));", path1, path2, path3);
    final Output out = KryoSerializationTests.serialize(localDataLayer);
    final LocalDataLayer deserialized = KryoSerializationTests.deserialize(out, LocalDataLayer.class);
    assertNotNull(deserialized);
    assertEquals(localDataLayer.version(), deserialized.version());
    assertEquals(localDataLayer, deserialized);
}
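The serialize and deserialize helpers referenced in both Kryo tests are not shown on this page. A minimal sketch of what they could look like with plain Kryo (com.esotericsoftware.kryo.Kryo, com.esotericsoftware.kryo.io.Input, com.esotericsoftware.kryo.io.Output), assuming class registration is not required and that LocalDataLayer round-trips with whatever serializer the project registers for it:

static Output serialize(final Object object) {
    final Kryo kryo = new Kryo();
    kryo.setRegistrationRequired(false); // assumption: rely on Kryo's default serializers
    final Output out = new Output(1024, -1); // 1 KiB initial buffer, unbounded growth
    kryo.writeObject(out, object);
    out.flush();
    return out;
}

static <T> T deserialize(final Output out, final Class<T> type) {
    final Kryo kryo = new Kryo();
    kryo.setRegistrationRequired(false);
    try (final Input in = new Input(out.getBuffer(), 0, (int) out.total())) {
        return kryo.readObject(in, type);
    }
}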