Use of org.apache.cassandra.spark.sparksql.filters.CustomFilter in project spark-cassandra-bulkreader by jberragan.
Class SSTableReaderTests, method testPartialFilterMatch:
@Test
public void testPartialFilterMatch() {
    runTest((partitioner, dir, bridge) -> {
        // write an SSTable
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < NUM_ROWS; i++) {
                for (int j = 0; j < NUM_COLS; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));

        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final TableMetadata metaData = new FourZeroSchemaBuilder(schema.createStmt, schema.keyspace,
                new ReplicationFactor(ReplicationFactor.ReplicationStrategy.SimpleStrategy,
                        ImmutableMap.of("replication_factor", 1)),
                partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));

        // three filters: a token-range filter and a key filter that both match partition 0,
        // plus a key filter for a partition that was never written
        final ByteBuffer key1 = Int32Type.instance.fromString("0");
        final BigInteger token1 = bridge.hash(partitioner, key1);
        final PartitionKeyFilter keyInSSTable = PartitionKeyFilter.create(key1, token1);
        final SparkRangeFilter rangeFilter = SparkRangeFilter.create(Range.closed(token1, token1));
        final ByteBuffer key2 = Int32Type.instance.fromString("55");
        final BigInteger token2 = bridge.hash(partitioner, key2);
        final PartitionKeyFilter keyNotInSSTable = PartitionKeyFilter.create(key2, token2);
        final List<CustomFilter> filters = Arrays.asList(rangeFilter, keyInSSTable, keyNotInSSTable);

        // the Stats hook verifies that a partition is only skipped when every filter agrees to skip it
        final AtomicBoolean pass = new AtomicBoolean(true);
        final AtomicInteger skipCount = new AtomicInteger(0);
        final Stats stats = new Stats() {
            @Override
            public void skippedPartition(ByteBuffer key, BigInteger token) {
                LOGGER.info("Skipping partition: " + token);
                skipCount.incrementAndGet();
                if (filters.stream().anyMatch(filter -> !filter.skipPartition(key, token))) {
                    LOGGER.info("Should not skip partition: " + token);
                    pass.set(false);
                }
            }
        };

        final FourZeroSSTableReader reader = openReader(metaData,
                dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")),
                filters, false, stats);
        final int rows = countAndValidateRows(reader);
        assertTrue(skipCount.get() > 0);
        assertEquals(NUM_COLS, rows);
        // should skip partitions not matching filters
        assertEquals((NUM_ROWS - skipCount.get()) * NUM_COLS, rows);
        assertTrue(pass.get());
    });
}
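The Stats hook above encodes the contract the test checks: a partition may be dropped only when every filter skips it, so a single matching filter keeps the partition alive. A minimal sketch of that rule follows; it is an illustration of the invariant, not the project's actual reader code.

// Sketch (hedged): the skip rule implied by the skippedPartition assertion above.
static boolean shouldSkipPartition(final List<CustomFilter> filters,
                                   final ByteBuffer key,
                                   final BigInteger token) {
    // drop the partition only if *every* filter skips it
    return filters.stream().allMatch(filter -> filter.skipPartition(key, token));
}

Under that rule only the partition for key 0 survives (both the range filter and keyInSSTable match it), which is why rows comes out to exactly NUM_COLS and the remaining NUM_ROWS - 1 partitions are skipped.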
Use of org.apache.cassandra.spark.sparksql.filters.CustomFilter in project spark-cassandra-bulkreader by jberragan.
Class SSTableReaderTests, method testExtractRangePartitionKeyFilters:
@Test
public void testExtractRangePartitionKeyFilters() {
    final List<ByteBuffer> keys = new ArrayList<>();
    for (int i = 0; i < 1000; i++) {
        keys.add((ByteBuffer) ByteBuffer.allocate(4).putInt(i).flip());
    }
    final List<PartitionKeyFilter> partitionKeyFilters = keys.stream().map(b -> {
        final BigInteger token = FourZeroUtils.tokenToBigInteger(Murmur3Partitioner.instance.getToken(b).getToken());
        return PartitionKeyFilter.create(b, token);
    }).collect(Collectors.toList());

    // keep only the key filters whose token falls inside the Spark token range
    final List<CustomFilter> filters = new ArrayList<>(partitionKeyFilters.size() + 1);
    final Range<BigInteger> sparkRange = Range.closed(new BigInteger("0"), new BigInteger("2305843009213693952"));
    filters.add(SparkRangeFilter.create(sparkRange));
    filters.addAll(partitionKeyFilters.stream().filter(t -> sparkRange.contains(t.token())).collect(Collectors.toList()));
    assertTrue(filters.size() > 1);

    // the extracted range should be strictly narrower than the Spark range
    final Optional<Range<BigInteger>> range = FourZeroSSTableReader.extractRange(filters);
    assertTrue(range.isPresent());
    assertNotEquals(sparkRange, range.get());
    assertTrue(sparkRange.lowerEndpoint().compareTo(range.get().lowerEndpoint()) < 0);
    assertTrue(sparkRange.upperEndpoint().compareTo(range.get().upperEndpoint()) > 0);
}
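extractRange collapses the partition-key filters into the tightest token span the reader has to scan. A plausible sketch of that reduction using Guava's Range.span is shown below; the real implementation lives in FourZeroSSTableReader and may differ.

// Sketch (hedged): reduce key-filter tokens to the smallest enclosing token range.
static Optional<Range<BigInteger>> extractRangeSketch(final List<CustomFilter> filters) {
    return filters.stream()
                  .filter(f -> f instanceof PartitionKeyFilter)                // ignore the broad Spark range filter
                  .map(f -> Range.singleton(((PartitionKeyFilter) f).token())) // one-point range per key token
                  .reduce(Range::span);                                        // smallest range covering all tokens
}

Because every key token lies strictly inside the Spark range, the reduced span is strictly narrower, which is exactly what the last three assertions verify.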
Use of org.apache.cassandra.spark.sparksql.filters.CustomFilter in project spark-cassandra-bulkreader by jberragan.
Class SSTableReaderTests, method testFilterKeyMissingInIndex:
@Test
public void testFilterKeyMissingInIndex() {
    runTest((partitioner, dir, bridge) -> {
        // write an SSTable
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int i = 0; i < NUM_ROWS; i++) {
                for (int j = 0; j < NUM_COLS; j++) {
                    writer.write(i, j, i + j);
                }
            }
        });
        assertEquals(1, countSSTables(dir));

        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final TableMetadata metaData = new FourZeroSchemaBuilder(schema.createStmt, schema.keyspace,
                new ReplicationFactor(ReplicationFactor.ReplicationStrategy.SimpleStrategy,
                        ImmutableMap.of("replication_factor", 1)),
                partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));

        // two key filters for partition keys that were never written to the SSTable
        final ByteBuffer key1 = Int32Type.instance.fromString("51");
        final BigInteger token1 = bridge.hash(partitioner, key1);
        final PartitionKeyFilter keyNotInSSTable1 = PartitionKeyFilter.create(key1, token1);
        final ByteBuffer key2 = Int32Type.instance.fromString("90");
        final BigInteger token2 = bridge.hash(partitioner, key2);
        final PartitionKeyFilter keyNotInSSTable2 = PartitionKeyFilter.create(key2, token2);
        final List<CustomFilter> filters = Arrays.asList(keyNotInSSTable1, keyNotInSSTable2);

        final AtomicBoolean pass = new AtomicBoolean(true);
        final AtomicInteger skipCount = new AtomicInteger(0);
        final Stats stats = new Stats() {
            @Override
            public void skippedSSTable(List<CustomFilter> filters, BigInteger firstToken, BigInteger lastToken) {
                pass.set(false);
            }

            @Override
            public void missingInIndex() {
                skipCount.incrementAndGet();
                if (filters.size() != 2) {
                    pass.set(false);
                }
            }
        };

        final FourZeroSSTableReader reader = openReader(metaData,
                dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")),
                filters, true, stats);
        assertTrue(reader.ignore());
        assertEquals(1, skipCount.get());
        assertTrue(pass.get());
    });
}
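reader.ignore() is the signal a consumer acts on: when every partition-key filter misses the Index.db, the Data.db file never needs to be opened. A hedged caller-side sketch follows; openReader is the test helper above, and the anonymous no-op Stats subclass is an assumption for illustration.

// Sketch (hedged): decide whether an SSTable needs to be read at all.
static boolean shouldReadSSTable(final TableMetadata metaData,
                                 final DataLayer.SSTable ssTable,
                                 final List<CustomFilter> filters) throws IOException {
    final FourZeroSSTableReader reader = openReader(metaData, ssTable, filters, true, new Stats() {});
    return !reader.ignore(); // ignore() == true means every key filter missed the index
}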
Use of org.apache.cassandra.spark.sparksql.filters.CustomFilter in project spark-cassandra-bulkreader by jberragan.
Class FourZeroUtils, method readPrimaryIndex:
/**
 * Read the primary Index.db file, scanning through all partitions to find the first and last partition keys.
 *
 * @param primaryIndex     input stream for the Index.db file
 * @param readFirstLastKey if true, scan to the end and return the first and last keys; if false, return early as soon as any key filter matches
 * @param filters          filters to test each partition key against
 * @return pair of first and last partition keys, or a pair of nulls if readFirstLastKey is false and a filter matched a key
 * @throws IOException if an I/O error occurs while reading the index file
 */
@SuppressWarnings("InfiniteLoopStatement")
static Pair<ByteBuffer, ByteBuffer> readPrimaryIndex(@NotNull final InputStream primaryIndex,
                                                     final boolean readFirstLastKey,
                                                     @NotNull final List<CustomFilter> filters) throws IOException {
    ByteBuffer firstKey = null;
    ByteBuffer lastKey = null;
    try (final DataInputStream dis = new DataInputStream(primaryIndex)) {
        byte[] last = null;
        try {
            while (true) {
                // each index entry starts with an unsigned-short length-prefixed partition key
                final int len = dis.readUnsignedShort();
                final byte[] buf = new byte[len];
                dis.readFully(buf);
                if (firstKey == null) {
                    firstKey = ByteBuffer.wrap(buf);
                }
                last = buf;
                final ByteBuffer key = ByteBuffer.wrap(last);
                // when we only need to know whether any filter matches, return as soon as one key does
                if (!readFirstLastKey && filters.stream().anyMatch(filter -> filter.canFilterByKey() && filter.filter(key))) {
                    return Pair.of(null, null);
                }
                // read position & skip promoted index
                skipRowIndexEntry(dis);
            }
        } catch (final EOFException ignored) {
            // end of the index file reached
        }
        if (last != null) {
            lastKey = ByteBuffer.wrap(last);
        }
    }
    return Pair.of(firstKey, lastKey);
}
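Called with readFirstLastKey = true and an empty filter list, the method degenerates into a plain "give me the key span of this SSTable" scan. A hedged usage sketch is below; openPrimaryIndexStream is an assumed accessor name on DataLayer.SSTable, used here only for illustration.

// Sketch (hedged): derive an SSTable's first/last partition keys from its primary index.
try (final InputStream in = ssTable.openPrimaryIndexStream()) { // assumed accessor name
    final Pair<ByteBuffer, ByteBuffer> firstLast =
            FourZeroUtils.readPrimaryIndex(in, true, Collections.emptyList());
    // firstLast.getLeft()/getRight() bound the keys this SSTable can contribute,
    // e.g. for testing against a SparkRangeFilter before opening Data.db
}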
Use of org.apache.cassandra.spark.sparksql.filters.CustomFilter in project spark-cassandra-bulkreader by jberragan.
Class FourZeroUtils, method filterKeyInBloomFilter:
static List<CustomFilter> filterKeyInBloomFilter(@NotNull final DataLayer.SSTable ssTable,
                                                 @NotNull final IPartitioner partitioner,
                                                 final Descriptor descriptor,
                                                 @NotNull final List<CustomFilter> filters) throws IOException {
    try {
        final BloomFilter bloomFilter = SSTableCache.INSTANCE.bloomFilter(ssTable, descriptor);
        final Function<CustomFilter, Boolean> canApplyMatch = CustomFilter::canFilterByKey;
        final Function<PartitionKeyFilter, Boolean> isKeyPresent = filter -> bloomFilter.isPresent(partitioner.decorateKey(filter.key()));
        // keep only the filters whose key might be present according to the bloom filter
        return filters.stream().filter(filter -> filter.matchFound(canApplyMatch, isKeyPresent)).collect(Collectors.toList());
    } catch (Exception e) {
        // no bloom filter file available: we cannot prune, so keep all filters
        if (e instanceof FileNotFoundException) {
            return filters;
        }
        throw e;
    }
}
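The returned list is the subset of filters that can still match after the bloom-filter check; an empty result for a purely key-filtered read means the SSTable can be skipped outright. A hedged sketch of that decision follows; firstToken and lastToken are assumed to come from the SSTable's index summary, and the skippedSSTable hook is the one exercised by the test above.

// Sketch (hedged): use the surviving filters to drive an SSTable-level skip decision.
static boolean canSkipSSTable(final DataLayer.SSTable ssTable,
                              final IPartitioner partitioner,
                              final Descriptor descriptor,
                              final List<CustomFilter> filters,
                              final Stats stats,
                              final BigInteger firstToken,  // assumed from the index summary
                              final BigInteger lastToken) throws IOException {
    final List<CustomFilter> surviving = filterKeyInBloomFilter(ssTable, partitioner, descriptor, filters);
    if (surviving.isEmpty() && filters.stream().allMatch(CustomFilter::canFilterByKey)) {
        stats.skippedSSTable(filters, firstToken, lastToken);
        return true; // no key filter can match anything in this SSTable
    }
    return false;
}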