Use of org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter in the project spark-cassandra-bulkreader by jberragan.
From the class SSTableReaderTests, method testPartialFilterMatch.
/**
 * Reads a freshly written SSTable through three filters: a token range filter pinned to the
 * token of key "0", a partition key filter for key "0", and a partition key filter for key "55".
 * The test expects exactly NUM_COLS rows back, at least one skipped partition, and that every
 * partition reported as skipped is one that all three filters agree should be skipped.
 */
@Test
public void testPartialFilterMatch() {
    runTest((partitioner, dir, bridge) -> {
        // populate a single SSTable on disk
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int row = 0; row < NUM_ROWS; row++) {
                for (int col = 0; col < NUM_COLS; col++) {
                    writer.write(row, col, row + col);
                }
            }
        });
        assertEquals(1, countSSTables(dir));

        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final ReplicationFactor replication = new ReplicationFactor(
                ReplicationFactor.ReplicationStrategy.SimpleStrategy,
                ImmutableMap.of("replication_factor", 1));
        final TableMetadata metaData =
                new FourZeroSchemaBuilder(schema.createStmt, schema.keyspace, replication, partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));

        // filter on key "0", together with a range filter covering only that key's token
        final ByteBuffer presentKey = Int32Type.instance.fromString("0");
        final BigInteger presentToken = bridge.hash(partitioner, presentKey);
        final PartitionKeyFilter keyInSSTable = PartitionKeyFilter.create(presentKey, presentToken);
        final SparkRangeFilter rangeFilter = SparkRangeFilter.create(Range.closed(presentToken, presentToken));

        // filter on key "55" (named keyNotInSSTable: expected to match nothing that was written)
        final ByteBuffer absentKey = Int32Type.instance.fromString("55");
        final BigInteger absentToken = bridge.hash(partitioner, absentKey);
        final PartitionKeyFilter keyNotInSSTable = PartitionKeyFilter.create(absentKey, absentToken);

        final List<CustomFilter> filters = Arrays.asList(rangeFilter, keyInSSTable, keyNotInSSTable);
        final AtomicBoolean pass = new AtomicBoolean(true);
        final AtomicInteger skipCount = new AtomicInteger(0);
        final Stats stats = new Stats() {
            @Override
            public void skippedPartition(ByteBuffer key, BigInteger token) {
                LOGGER.info("Skipping partition: " + token);
                skipCount.incrementAndGet();
                // a skip is legitimate only when every filter agrees the partition should be skipped
                final boolean everyFilterSkips = filters.stream().allMatch(filter -> filter.skipPartition(key, token));
                if (everyFilterSkips) {
                    return;
                }
                LOGGER.info("Should not skip partition: " + token);
                pass.set(false);
            }
        };

        final FourZeroSSTableReader reader = openReader(
                metaData,
                dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")),
                filters,
                false,
                stats);
        final int rows = countAndValidateRows(reader);

        assertTrue(skipCount.get() > 0);
        assertEquals(NUM_COLS, rows);
        // every partition that was not skipped must have contributed NUM_COLS rows
        assertEquals((NUM_ROWS - skipCount.get()) * NUM_COLS, rows);
        assertTrue(pass.get());
    });
}
Use of org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter in the project spark-cassandra-bulkreader by jberragan.
From the class SSTableReaderTests, method testExtractRangePartitionKeyFilters.
/**
 * Checks that FourZeroSSTableReader.extractRange, when handed a Spark token range filter plus
 * the partition key filters whose tokens fall inside that range, produces a range that is
 * present, differs from the Spark range, and lies strictly inside it at both endpoints.
 */
@Test
public void testExtractRangePartitionKeyFilters() {
    final Range<BigInteger> sparkRange = Range.closed(new BigInteger("0"), new BigInteger("2305843009213693952"));

    // one partition key filter per 4-byte int key in [0, 1000)
    final List<PartitionKeyFilter> partitionKeyFilters = new ArrayList<>();
    for (int value = 0; value < 1000; value++) {
        final ByteBuffer key = (ByteBuffer) ByteBuffer.allocate(4).putInt(value).flip();
        final BigInteger token = FourZeroUtils.tokenToBigInteger(Murmur3Partitioner.instance.getToken(key).getToken());
        partitionKeyFilters.add(PartitionKeyFilter.create(key, token));
    }

    // the range filter goes first, followed by only those key filters whose token is in range
    final List<CustomFilter> filters = new ArrayList<>(partitionKeyFilters.size() + 1);
    filters.add(SparkRangeFilter.create(sparkRange));
    for (final PartitionKeyFilter keyFilter : partitionKeyFilters) {
        if (sparkRange.contains(keyFilter.token())) {
            filters.add(keyFilter);
        }
    }
    assertTrue(filters.size() > 1);

    final Optional<Range<BigInteger>> range = FourZeroSSTableReader.extractRange(filters);
    assertTrue(range.isPresent());
    final Range<BigInteger> extracted = range.get();
    assertNotEquals(sparkRange, extracted);
    // the extracted range must be strictly narrower than the Spark range on both sides
    assertTrue(extracted.lowerEndpoint().compareTo(sparkRange.lowerEndpoint()) > 0);
    assertTrue(extracted.upperEndpoint().compareTo(sparkRange.upperEndpoint()) < 0);
}
Use of org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter in the project spark-cassandra-bulkreader by jberragan.
From the class SSTableReaderTests, method testFilterKeyMissingInIndex.
/**
 * Opens a freshly written SSTable with two partition key filters (keys "51" and "90", both named
 * as not being in the SSTable) and expects the reader to report itself as ignorable, with
 * Stats.missingInIndex invoked exactly once and Stats.skippedSSTable never invoked.
 */
@Test
public void testFilterKeyMissingInIndex() {
    runTest((partitioner, dir, bridge) -> {
        // populate a single SSTable on disk
        final TestSchema schema = TestSchema.basic(bridge);
        TestUtils.writeSSTable(bridge, dir, partitioner, schema, (writer) -> {
            for (int row = 0; row < NUM_ROWS; row++) {
                for (int col = 0; col < NUM_COLS; col++) {
                    writer.write(row, col, row + col);
                }
            }
        });
        assertEquals(1, countSSTables(dir));

        final Path dataFile = getFirstFileType(dir, DataLayer.FileType.DATA);
        final ReplicationFactor replication = new ReplicationFactor(
                ReplicationFactor.ReplicationStrategy.SimpleStrategy,
                ImmutableMap.of("replication_factor", 1));
        final TableMetadata metaData =
                new FourZeroSchemaBuilder(schema.createStmt, schema.keyspace, replication, partitioner).tableMetaData();
        final TestDataLayer dataLayer = new TestDataLayer(bridge, Collections.singletonList(dataFile));

        // two key filters, for keys "51" and "90"
        final ByteBuffer firstKey = Int32Type.instance.fromString("51");
        final BigInteger firstToken = bridge.hash(partitioner, firstKey);
        final PartitionKeyFilter keyNotInSSTable1 = PartitionKeyFilter.create(firstKey, firstToken);

        final ByteBuffer secondKey = Int32Type.instance.fromString("90");
        final BigInteger secondToken = bridge.hash(partitioner, secondKey);
        final PartitionKeyFilter keyNotInSSTable2 = PartitionKeyFilter.create(secondKey, secondToken);

        final List<CustomFilter> filters = Arrays.asList(keyNotInSSTable1, keyNotInSSTable2);
        final AtomicBoolean pass = new AtomicBoolean(true);
        final AtomicInteger skipCount = new AtomicInteger(0);
        final Stats stats = new Stats() {
            @Override
            public void skippedSSTable(List<CustomFilter> skippedFilters, BigInteger firstToken, BigInteger lastToken) {
                // this callback is not expected to fire in this scenario; firing fails the test
                pass.set(false);
            }

            @Override
            public void missingInIndex() {
                skipCount.incrementAndGet();
                // sanity check on the filter list captured from the enclosing test
                if (filters.size() == 2) {
                    return;
                }
                pass.set(false);
            }
        };

        final FourZeroSSTableReader reader = openReader(
                metaData,
                dataLayer.listSSTables().findFirst().orElseThrow(() -> new RuntimeException("Could not find SSTable")),
                filters,
                true,
                stats);

        assertTrue(reader.ignore());
        assertEquals(1, skipCount.get());
        assertTrue(pass.get());
    });
}
Use of org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter in the project spark-cassandra-bulkreader by jberragan.
From the class FourZeroUtils, method filterKeyInBloomFilter.
/**
 * Narrows the given filters using the bloom filter that SSTableCache supplies for the SSTable.
 *
 * <p>Each filter is kept only when its {@code matchFound} call returns true, given a
 * "can filter by key" predicate and a predicate that asks the bloom filter whether the
 * filter's decorated key is present.
 *
 * @param ssTable     the SSTable whose bloom filter is looked up
 * @param partitioner partitioner used to decorate each filter key before the bloom filter lookup
 * @param descriptor  descriptor passed through to the bloom filter lookup
 * @param filters     the candidate filters
 * @return the filters for which {@code matchFound} returned true, or the original {@code filters}
 *         list unchanged when a FileNotFoundException is raised
 * @throws IOException declared for failures other than FileNotFoundException, which are rethrown
 */
static List<CustomFilter> filterKeyInBloomFilter(@NotNull final DataLayer.SSTable ssTable, @NotNull final IPartitioner partitioner, final Descriptor descriptor, @NotNull final List<CustomFilter> filters) throws IOException {
    try {
        final BloomFilter bloom = SSTableCache.INSTANCE.bloomFilter(ssTable, descriptor);
        final Function<CustomFilter, Boolean> keyFilterable = CustomFilter::canFilterByKey;
        final Function<PartitionKeyFilter, Boolean> mayContainKey =
                keyFilter -> bloom.isPresent(partitioner.decorateKey(keyFilter.key()));
        return filters.stream()
                      .filter(candidate -> candidate.matchFound(keyFilterable, mayContainKey))
                      .collect(Collectors.toList());
    } catch (Exception e) {
        if (!(e instanceof FileNotFoundException)) {
            throw e;
        }
        // NOTE(review): presumably the bloom filter file is absent for this SSTable - confirm.
        // Without it nothing can be ruled out, so the filters are handed back untouched.
        return filters;
    }
}
Use of org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter in the project spark-cassandra-bulkreader by jberragan.
From the class PartitionKeyFilterTests, method testValidFilter.
/**
 * Exercises a PartitionKeyFilter built from key "10" and its Murmur3 token: key matching, token
 * range overlap, partition skipping, matching against a mocked reader whose range is that single
 * token, and matchFound gating on the "can apply" predicate.
 */
@Test
public void testValidFilter() {
    final ByteBuffer key = Int32Type.instance.fromString("10");
    final BigInteger token = BigInteger.valueOf((long) Murmur3Partitioner.instance.getToken(key).getTokenValue());
    final ByteBuffer diffKey = Int32Type.instance.fromString("11");
    final BigInteger diffToken = BigInteger.valueOf((long) Murmur3Partitioner.instance.getToken(diffKey).getTokenValue());

    final PartitionKeyFilter filter = PartitionKeyFilter.create(key, token);

    // key matching: only the filter's own key passes
    assertTrue(filter.filter(key));
    assertFalse(filter.filter(diffKey));

    // range overlap: a single-token range at the filter's token overlaps, the one just below does not
    final BigInteger previousToken = token.subtract(BigInteger.ONE);
    final Range<BigInteger> inRange = Range.closed(token, token);
    final Range<BigInteger> notInRange = Range.closed(previousToken, previousToken);
    assertTrue(filter.overlaps(inRange));
    assertFalse(filter.overlaps(notInRange));

    // partition skipping: the filter's own partition is kept, a different one is skipped
    assertFalse(filter.skipPartition(key, token));
    assertTrue(filter.skipPartition(diffKey, diffToken));

    // a reader whose range is exactly the filter's token is accepted
    final SparkSSTableReader reader = mock(SparkSSTableReader.class);
    when(reader.range()).thenReturn(Range.closed(token, token));
    assertTrue(filter.filter(reader));

    // matchFound succeeds only when the "can apply" predicate accepts this filter type
    final Function<CustomFilter, Boolean> acceptsKeyFilters = PartitionKeyFilter.class::isInstance;
    final Function<CustomFilter, Boolean> acceptsRangeFilters = SparkRangeFilter.class::isInstance;
    final Function<PartitionKeyFilter, Boolean> alwaysMatches = ignored -> Boolean.TRUE;
    assertTrue(filter.matchFound(acceptsKeyFilters, alwaysMatches));
    assertFalse(filter.matchFound(acceptsRangeFilters, alwaysMatches));
}
Aggregations