use of org.apache.cassandra.spark.data.DataLayer in project spark-cassandra-bulkreader by jberragan.
the class IndexDbTests method testSearchIndex.
@Test
public void testSearchIndex() {
runTest((partitioner, dir, bridge) -> {
final TestSchema schema = TestSchema.basicBuilder(bridge).withCompression(false).build();
final IPartitioner iPartitioner = FourZero.getPartitioner(partitioner);
final int numRows = 5000;
// write an sstable and record each partition key's token
final List<BigInteger> tokens = new ArrayList<>(numRows);
TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
for (int i = 0; i < numRows; i++) {
final ByteBuffer key = (ByteBuffer) ByteBuffer.allocate(4).putInt(i).flip();
final BigInteger token = FourZeroUtils.tokenToBigInteger(iPartitioner.decorateKey(key).getToken());
tokens.add(token);
writer.write(i, 0, i);
}
});
assertEquals(1, countSSTables(dir));
Collections.sort(tokens);
final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
if (metadata == null) {
throw new NullPointerException("Could not find table");
}
final Path summaryDb = TestUtils.getFirstFileType(dir, DataLayer.FileType.SUMMARY);
assertNotNull(summaryDb);
final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, schema.keyspace, schema.createStmt, false, Collections.emptySet(), true, null, dir.toString());
final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));
final int rowSize = 39;
final int sample = 4;
// sample the token list, read the offset in Index.db for each sampled token, and verify it matches the expected offset;
// we sample the list because IndexDbUtils.findStartOffset(...) returns the previous offset, so we want to verify
// that it correctly skips tokens less than the one we are looking for before returning.
final List<BigInteger> sparseList = IntStream.range(0, tokens.size()).filter(i -> i > 0 && i % sample == 0).mapToObj(tokens::get).collect(Collectors.toList());
assertEquals((numRows / 4) - 1, sparseList.size());
try (final DataInputStream in = new DataInputStream(Objects.requireNonNull(ssTable.openPrimaryIndexStream()))) {
try {
for (int idx = 0; idx < sparseList.size(); idx++) {
final BigInteger token = sparseList.get(idx);
final long expectedOffset = (((idx + 1L) * sample) - 1) * rowSize;
final long offset = IndexDbUtils.findStartOffset(in, iPartitioner, Range.closed(token, token), Stats.DoNothingStats.INSTANCE);
assertEquals(expectedOffset, offset);
FourZeroUtils.skipRowIndexEntry(in);
}
} catch (final EOFException ignore) {
}
}
});
}
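For reference, the expected-offset arithmetic used above can be spelled out on its own. The following is a minimal sketch (the class and method names are invented for illustration), relying only on what the test itself states: each Index.db entry for this fixed-width schema occupies rowSize = 39 bytes, and sampled index idx corresponds to row number ((idx + 1) * sample) - 1 in sorted token order.

// Minimal sketch (hypothetical names, not project code): maps a sampled index back to its
// row number in sorted token order and to the expected byte offset of its Index.db entry.
public final class IndexOffsetArithmetic {
    static long expectedOffset(final int sampledIdx, final int sample, final int rowSize) {
        final long rowNumber = ((sampledIdx + 1L) * sample) - 1; // position in the sorted token list
        return rowNumber * rowSize;                              // fixed-width entries => a plain multiplication
    }

    public static void main(final String[] args) {
        // with sample = 4 and rowSize = 39: sampled index 0 is row 3 -> offset 117, index 1 is row 7 -> offset 273
        System.out.println(expectedOffset(0, 4, 39)); // 117
        System.out.println(expectedOffset(1, 4, 39)); // 273
    }
}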
use of org.apache.cassandra.spark.data.DataLayer in project spark-cassandra-bulkreader by jberragan.
the class SSTableReaderTests method testIncrementalRepair.
// incremental repair
@Test
public void testIncrementalRepair() {
runTest((partitioner, dir, bridge) -> {
final TestSchema schema = TestSchema.basic(bridge);
final int numSSTables = 4;
final int numRepaired = 2;
final int numUnRepaired = numSSTables - numRepaired;
// write some SSTables
for (int a = 0; a < numSSTables; a++) {
final int pos = a * NUM_ROWS;
TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
for (int i = pos; i < pos + NUM_ROWS; i++) {
for (int j = 0; j < NUM_COLS; j++) {
writer.write(i, j, i + j);
}
}
});
}
assertEquals(numSSTables, countSSTables(dir));
final TableMetadata metaData = new FourZeroSchemaBuilder(schema.createStmt, schema.keyspace, new ReplicationFactor(ReplicationFactor.ReplicationStrategy.SimpleStrategy, ImmutableMap.of("replication_factor", 1)), partitioner).tableMetaData();
final TestDataLayer dataLayer = new TestDataLayer(bridge, getFileType(dir, DataLayer.FileType.DATA).collect(Collectors.toList()));
final AtomicInteger skipCount = new AtomicInteger(0);
final Stats stats = new Stats() {
@Override
public void skippedRepairedSSTable(DataLayer.SSTable ssTable, long repairedAt) {
skipCount.incrementAndGet();
}
};
// mark some SSTables as repaired
final Map<DataLayer.SSTable, Boolean> isRepaired = dataLayer.listSSTables().collect(Collectors.toMap(Function.identity(), a -> false));
int count = 0;
for (final DataLayer.SSTable ssTable : isRepaired.keySet()) {
if (count < numRepaired) {
isRepaired.put(ssTable, true);
count++;
}
}
final List<FourZeroSSTableReader> primaryReaders = dataLayer.listSSTables().map(ssTable -> openIncrementalReader(metaData, ssTable, stats, true, isRepaired.get(ssTable))).filter(reader -> !reader.ignore()).collect(Collectors.toList());
final List<FourZeroSSTableReader> nonPrimaryReaders = dataLayer.listSSTables().map(ssTable -> openIncrementalReader(metaData, ssTable, stats, false, isRepaired.get(ssTable))).filter(reader -> !reader.ignore()).collect(Collectors.toList());
// primary repair replica should read all sstables
assertEquals(numSSTables, primaryReaders.size());
// non-primary repair replica should only read unrepaired sstables
assertEquals(numUnRepaired, nonPrimaryReaders.size());
for (final FourZeroSSTableReader reader : nonPrimaryReaders) {
assertFalse(isRepaired.get(reader.sstable()));
}
assertEquals(numUnRepaired, skipCount.get());
final Set<FourZeroSSTableReader> toCompact = Stream.concat(primaryReaders.stream().filter(r -> isRepaired.get(r.sstable())), nonPrimaryReaders.stream()).collect(Collectors.toSet());
assertEquals(numSSTables, toCompact.size());
int rowCount = 0;
boolean[] found = new boolean[numSSTables * NUM_ROWS];
try (final CompactionStreamScanner scanner = new CompactionStreamScanner(metaData, partitioner, toCompact)) {
// iterate through CompactionScanner and verify we have all the partition keys we are looking for
final Rid rid = scanner.getRid();
while (scanner.hasNext()) {
scanner.next();
final int a = rid.getPartitionKey().asIntBuffer().get();
found[a] = true;
// extract clustering key value and column name
final ByteBuffer colBuf = rid.getColumnName();
final ByteBuffer clusteringKey = ByteBufUtils.readBytesWithShortLength(colBuf);
colBuf.get();
final String colName = ByteBufUtils.string(ByteBufUtils.readBytesWithShortLength(colBuf));
colBuf.get();
if (StringUtils.isEmpty(colName)) {
continue;
}
assertEquals("c", colName);
final int b = clusteringKey.asIntBuffer().get();
// extract value column
final int c = rid.getValue().asIntBuffer().get();
assertEquals(c, a + b);
rowCount++;
}
}
assertEquals(numSSTables * NUM_ROWS * NUM_COLS, rowCount);
for (final boolean b : found) {
assertTrue(b);
}
});
}
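The colBuf parsing above (readBytesWithShortLength followed by a single-byte get) assumes a CompositeType-style layout for the column name: each component is a 2-byte unsigned length, the component bytes, and one end-of-component byte. The sketch below uses hypothetical class and method names, not project code, and parses that assumed layout with plain java.nio so the clustering-key / column-name split is easier to follow.

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

// Minimal sketch (hypothetical names): splits a composite column-name buffer into its components,
// assuming each component is encoded as [2-byte unsigned length][component bytes][1 end-of-component byte].
public final class CompositeSplitSketch {
    static List<ByteBuffer> split(final ByteBuffer composite) {
        final List<ByteBuffer> components = new ArrayList<>();
        while (composite.hasRemaining()) {
            final int len = composite.getShort() & 0xFFFF; // unsigned 16-bit component length
            final ByteBuffer component = composite.slice();
            component.limit(len);
            components.add(component);
            composite.position(composite.position() + len);
            composite.get(); // consume the end-of-component byte
        }
        return components;
    }

    public static void main(final String[] args) {
        // one int clustering key (42) followed by the ascii column name "c"
        final ByteBuffer buf = ByteBuffer.allocate(11);
        buf.putShort((short) 4).putInt(42).put((byte) 0);
        buf.putShort((short) 1).put((byte) 'c').put((byte) 0);
        buf.flip();
        final List<ByteBuffer> parts = split(buf);
        System.out.println(parts.get(0).getInt());                          // 42
        System.out.println(StandardCharsets.US_ASCII.decode(parts.get(1))); // c
    }
}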
use of org.apache.cassandra.spark.data.DataLayer in project spark-cassandra-bulkreader by jberragan.
the class SparkRowIteratorTests method testRowIterator.
private static void testRowIterator(final CassandraBridge.CassandraVersion version, final TestSchema schema, final TestSchema.TestRow[] testRows) throws IOException {
final CassandraBridge bridge = CassandraBridge.get(version);
final CqlSchema cqlSchema = schema.buildSchema();
final int numRows = testRows.length;
final int numColumns = cqlSchema.fields().size() - cqlSchema.numPartitionKeys() - cqlSchema.numClusteringKeys();
final List<CqlField> columns = cqlSchema.fields().stream().filter(f -> !f.isPartitionKey()).filter(f -> !f.isClusteringColumn()).sorted().collect(Collectors.toList());
final Rid rid = new Rid();
final AtomicInteger rowPos = new AtomicInteger();
final AtomicInteger colPos = new AtomicInteger();
// mock data layer
final DataLayer dataLayer = mock(DataLayer.class);
when(dataLayer.cqlSchema()).thenReturn(cqlSchema);
when(dataLayer.version()).thenReturn(version);
when(dataLayer.isInPartition(any(BigInteger.class), any(ByteBuffer.class))).thenReturn(true);
when(dataLayer.bridge()).thenCallRealMethod();
when(dataLayer.stats()).thenReturn(Stats.DoNothingStats.INSTANCE);
when(dataLayer.requestedFeatures()).thenCallRealMethod();
// mock scanner
final IStreamScanner scanner = mock(IStreamScanner.class);
when(scanner.hasNext()).thenAnswer(invocation -> rowPos.get() < numRows);
when(scanner.getRid()).thenReturn(rid);
doAnswer(invocation -> {
final int col = colPos.getAndIncrement();
final TestSchema.TestRow testRow = testRows[rowPos.get()];
// write next partition key
if (col == 0) {
if (cqlSchema.numPartitionKeys() == 1) {
final CqlField partitionKey = cqlSchema.partitionKeys().get(0);
rid.setPartitionKeyCopy(partitionKey.serialize(testRow.get(partitionKey.pos())), BigInteger.ONE);
} else {
assert cqlSchema.numPartitionKeys() > 1;
final ByteBuffer[] partitionBuffers = new ByteBuffer[cqlSchema.numPartitionKeys()];
int pos = 0;
for (final CqlField partitionKey : cqlSchema.partitionKeys()) {
partitionBuffers[pos] = partitionKey.serialize(testRow.get(partitionKey.pos()));
pos++;
}
rid.setPartitionKeyCopy(ColumnTypes.build(false, partitionBuffers), BigInteger.ONE);
}
}
// write next clustering keys & column name
final CqlField column = columns.get(col);
final ByteBuffer[] colBuffers = new ByteBuffer[cqlSchema.numClusteringKeys() + 1];
int pos = 0;
for (final CqlField clusteringColumn : cqlSchema.clusteringKeys()) {
colBuffers[pos] = clusteringColumn.serialize(testRow.get(clusteringColumn.pos()));
pos++;
}
colBuffers[pos] = bridge.ascii().serialize(column.name());
rid.setColumnNameCopy(ColumnTypes.build(false, colBuffers));
// write value, timestamp and tombstone
rid.setValueCopy(column.serialize(testRow.get(column.pos())));
// move to next row
if (colPos.get() == numColumns) {
if (rowPos.getAndIncrement() >= numRows) {
throw new IllegalStateException("Went too far...");
}
// reset column position
colPos.set(0);
}
return null;
}).when(scanner).next();
when(dataLayer.openCompactionScanner(anyList(), any())).thenReturn(scanner);
// use SparkRowIterator and verify values match expected
final SparkRowIterator it = new SparkRowIterator(dataLayer);
int rowCount = 0;
while (it.next()) {
while (rowCount < testRows.length && testRows[rowCount].isTombstone()) // skip tombstones
{
rowCount++;
}
if (rowCount >= testRows.length) {
break;
}
final TestSchema.TestRow row = testRows[rowCount];
assertEquals(row, schema.toTestRow(it.get()));
rowCount++;
}
assertEquals(numRows, rowCount);
it.close();
}
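ColumnTypes.build(false, colBuffers) above is assumed to produce the same CompositeType-style encoding that testIncrementalRepair parses: each component serialized as a 2-byte length, the bytes, and a 0x00 end-of-component byte. Built by hand (a sketch with made-up names, not the project's ColumnTypes implementation), that would look roughly like this:

import java.nio.ByteBuffer;

// Minimal sketch (hypothetical names, assumed layout): concatenates the clustering-key buffers plus the
// column-name buffer into one composite, encoding each component as [2-byte length][bytes][0x00 end-of-component].
public final class CompositeBuildSketch {
    static ByteBuffer build(final ByteBuffer... components) {
        int size = 0;
        for (final ByteBuffer component : components) {
            size += 2 + component.remaining() + 1;
        }
        final ByteBuffer composite = ByteBuffer.allocate(size);
        for (final ByteBuffer component : components) {
            composite.putShort((short) component.remaining());
            composite.put(component.duplicate()); // duplicate so the caller's buffer position is untouched
            composite.put((byte) 0);              // end-of-component byte
        }
        composite.flip();
        return composite;
    }
}

Fed through the parsing sketch shown after testIncrementalRepair, such a buffer splits back into the clustering-key values followed by the column name, which is what the Rid consumer in that test relies on.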
use of org.apache.cassandra.spark.data.DataLayer in project spark-cassandra-bulkreader by jberragan.
the class SingleReplicaTests method runTest.
private static void runTest(final boolean shouldThrowIOException, final SSTablesSupplier.ReaderOpener<Reader> readerOpener, final Range<BigInteger> range, final DataLayer.FileType... missingFileTypes) throws InterruptedException, IOException, ExecutionException {
final PartitionedDataLayer dataLayer = mock(PartitionedDataLayer.class);
final CassandraInstance instance = new CassandraInstance("-9223372036854775808", "local1-i1", "DC1");
final DataLayer.SSTable ssTable1 = mockSSTable();
final DataLayer.SSTable ssTable2 = mockSSTable();
final DataLayer.SSTable ssTable3 = mockSSTable();
for (final DataLayer.FileType fileType : missingFileTypes) {
// mark this file type as missing so that verify() throws IncompleteSSTableException (e.g. when the Statistics.db file is missing)
when(ssTable3.isMissing(eq(fileType))).thenReturn(true);
}
final Stream<DataLayer.SSTable> sstables = Stream.of(ssTable1, ssTable2, ssTable3);
when(dataLayer.listInstance(eq(0), eq(range), eq(instance))).thenReturn(CompletableFuture.completedFuture(sstables));
final SingleReplica replica = new SingleReplica(instance, dataLayer, range, 0, EXECUTOR, true);
final Set<Reader> readers;
try {
readers = replica.openReplicaAsync(readerOpener).get();
} catch (final ExecutionException e) {
// extract IOException and rethrow if wrapped in SSTableStreamException
final IOException io = SSTableStreamException.getIOException(e);
if (io != null) {
throw io;
}
throw e;
}
if (shouldThrowIOException) {
fail("Should throw IOException because an SSTable is corrupt");
}
assertEquals(3, readers.size());
}
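The catch block relies on SSTableStreamException.getIOException(e) to pull the underlying IOException out of the wrapped ExecutionException. A generic version of that unwrapping, shown here only as a sketch (the class and method names are invented; the project's implementation may differ), simply walks the cause chain:

import java.io.IOException;

// Minimal sketch (hypothetical names, not the project's SSTableStreamException.getIOException):
// walks the cause chain of a throwable and returns the first IOException found, or null if there is none.
public final class ExceptionUnwrapSketch {
    static IOException findIOException(Throwable t) {
        while (t != null) {
            if (t instanceof IOException) {
                return (IOException) t;
            }
            t = t.getCause();
        }
        return null;
    }
}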
use of org.apache.cassandra.spark.data.DataLayer in project spark-cassandra-bulkreader by jberragan.
the class IndexOffsetTests method test.
private static void test(final CassandraBridge bridge, final Path dir, final Partitioner partitioner, final boolean enableCompression) throws IOException {
final int numKeys = 500000;
final int sparkPartitions = 128;
final TestSchema schema = TestSchema.basicBuilder(bridge).withCompression(enableCompression).build();
TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
for (int i = 0; i < numKeys; i++) {
writer.write(i, 0, i);
}
});
assertEquals(1, countSSTables(dir));
final TableMetadata metadata = Schema.instance.getTableMetadata(schema.keyspace, schema.table);
if (metadata == null) {
throw new NullPointerException("Could not find table");
}
final LocalDataLayer dataLayer = new LocalDataLayer(CassandraBridge.CassandraVersion.FOURZERO, partitioner, schema.keyspace, schema.createStmt, false, Collections.emptySet(), true, null, dir.toString());
final DataLayer.SSTable ssTable = dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find sstable"));
final Integer[] counts = IntStream.range(0, numKeys).map(i -> 0).boxed().toArray(Integer[]::new);
final CassandraRing ring = TestUtils.createRing(partitioner, 32);
// use TokenPartitioner to simulate Spark worker token partitions
final TokenPartitioner tokenPartitioner = new TokenPartitioner(ring, 1, sparkPartitions);
final List<Range<BigInteger>> ranges = tokenPartitioner.subRanges();
LOGGER.info("Testing index offsets numKeys={} sparkPartitions={} partitioner={} enableCompression={}", numKeys, ranges.size(), partitioner.name(), enableCompression);
final MutableInt skipped = new MutableInt(0);
for (Range<BigInteger> range : ranges) {
final FourZeroSSTableReader reader = FourZeroSSTableReader.builder(metadata, ssTable).withFilters(Collections.singletonList(SparkRangeFilter.create(range))).withStats(new Stats() {
public void skippedPartition(ByteBuffer key, BigInteger token) {
skipped.add(1);
}
}).build();
if (reader.ignore()) {
// we can skip this range entirely; it doesn't overlap with the sstable
continue;
}
// each scanner should only read tokens within its own token range
try (final ISSTableScanner scanner = reader.getScanner()) {
while (scanner.hasNext()) {
final UnfilteredRowIterator rowIterator = scanner.next();
final int key = rowIterator.partitionKey().getKey().getInt();
// count how many times we read a key across all 'spark' token partitions
counts[key] = counts[key] + 1;
while (rowIterator.hasNext()) {
rowIterator.next();
}
}
}
}
// verify we read each key exactly once across all Spark partitions
assertEquals(counts.length, numKeys);
int idx = 0;
for (Integer count : counts) {
if (count == 0) {
LOGGER.error("Missing key key={} token={} partitioner={}", idx, FourZeroUtils.tokenToBigInteger(FourZero.getPartitioner(partitioner).decorateKey((ByteBuffer) ByteBuffer.allocate(4).putInt(idx).flip()).getToken()), partitioner.name());
} else if (count > 1) {
LOGGER.error("Key read by more than 1 Spark partition key={} token={} partitioner={}", idx, FourZeroUtils.tokenToBigInteger(FourZero.getPartitioner(partitioner).decorateKey((ByteBuffer) ByteBuffer.allocate(4).putInt(idx).flip()).getToken()), partitioner.name());
}
assertEquals(count == 0 ? "Key not found: " + idx : "Key " + idx + " read " + count + " times", 1, count.intValue());
idx++;
}
LOGGER.info("Success skippedKeys={} partitioner={}", skipped.intValue(), partitioner.name());
}
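The per-key count assertion only holds if the TokenPartitioner sub-ranges cover the SSTable's token range without overlapping, so every token lands in exactly one Spark partition. A minimal sketch of that invariant (hypothetical helper, not project code), using the same Guava Range type the test already uses:

import com.google.common.collect.Range;
import java.math.BigInteger;
import java.util.List;

// Minimal sketch (hypothetical names, not project code): counts how many Spark sub-ranges contain a
// given token; the "each key read exactly once" assertion implies this count is exactly 1 for every token.
public final class RangeCoverageSketch {
    static long containingRanges(final List<Range<BigInteger>> subRanges, final BigInteger token) {
        return subRanges.stream().filter(range -> range.contains(token)).count();
    }
}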