use of org.apache.spark.util.sketch.BloomFilter in project beneficiary-fhir-data by CMSgov.
the class LoadedFilterManager method buildFilter.
/**
 * Creates the {@link LoadedFileFilter} for one loaded file from that file's batches.
 *
 * <p>The batches are obtained through {@code fetchById}. Every beneficiary of every batch is added
 * to a newly created bloom filter, and the filter's last-updated time is the latest batch creation
 * time, never earlier than {@code firstUpdated}. Intended to behave as a pure function of its
 * arguments, apart from the informational log line it writes.
 *
 * @param fileId id of the loaded file to build a filter for
 * @param firstUpdated time stamp passed to the filter as its first-updated time; also the starting
 *     value when searching for the last-updated time
 * @param fetchById function that returns the list of batches for a file id
 * @return a newly built filter
 * @throws IllegalArgumentException if {@code fetchById} returns no batches for {@code fileId}
 */
public static LoadedFileFilter buildFilter(
    long fileId, Instant firstUpdated, Function<Long, List<LoadedBatch>> fetchById) {
  final List<LoadedBatch> batches = fetchById.apply(fileId);
  if (batches.isEmpty()) {
    throw new IllegalArgumentException("Batches cannot be empty for a filter");
  }
  final int batchCount = batches.size();

  // The expected number of entries drives both the false-positive rate and the memory footprint
  // of the bloom filter. The estimate is derived from the first batch only, i.e. it assumes that
  // all batches are of equal size.
  final int estimatedIdsPerBatch =
      (batches.get(0).getBeneficiaries().length() + BENE_ID_SIZE) / BENE_ID_SIZE;
  final BloomFilter bloomFilter =
      LoadedFileFilter.createFilter(estimatedIdsPerBatch * batchCount);

  // Single pass over the batches: add their beneficiaries and track the newest creation time.
  Instant latestCreated = firstUpdated;
  for (final LoadedBatch batch : batches) {
    batch.getBeneficiariesAsList().forEach(bloomFilter::putString);

    final Instant created = batch.getCreated();
    if (created.isAfter(latestCreated)) {
      latestCreated = created;
    }
  }

  LOGGER.info("Built a filter for {} with {} batches", fileId, batchCount);
  return new LoadedFileFilter(fileId, batchCount, firstUpdated, latestCreated, bloomFilter);
}
use of org.apache.spark.util.sketch.BloomFilter in project beneficiary-fhir-data by CMSgov.
the class LoadedFilterTest method testMightContain.
/**
 * Checks that {@code LoadedFileFilter.mightContain} finds every id that was put into the wrapped
 * bloom filter and rejects ids that were never added.
 */
@Test
public void testMightContain() {
  // Very small test of the underlying BloomFilter (Spark's sketch implementation, which exposes
  // putString, not Guava's). The library itself is assumed to work.
  final BloomFilter smallFilter = LoadedFileFilter.createFilter(10);
  smallFilter.putString("1");
  smallFilter.putString("100");
  // Putting the same id a second time must not disturb the filter.
  smallFilter.putString("100");

  final LoadedFileFilter filter1 =
      new LoadedFileFilter(
          1, 1, Instant.now().minusSeconds(10), Instant.now().minusSeconds(5), smallFilter);

  // Every inserted id has to be reported; a bloom filter never yields false negatives.
  assertTrue(filter1.mightContain("1"));
  assertTrue(filter1.mightContain("100"));

  // Ids that were never added. Bloom filters can report false positives in general, but these
  // values are expected not to collide in such a sparsely filled filter.
  assertFalse(filter1.mightContain("888"));
  assertFalse(filter1.mightContain("BAD"));
}
use of org.apache.spark.util.sketch.BloomFilter in project beneficiary-fhir-data by CMSgov.
the class LoadedFilterTest method testMatchesDateRange.
/**
 * Checks {@code LoadedFileFilter.matchesDateRange} against a filter whose first-updated time is
 * 10 seconds ago and whose last-updated time is 5 seconds ago.
 *
 * <p>Covers null and empty ranges, open-ended ranges that overlap the window, open-ended ranges
 * that miss it entirely, and ranges that only partially overlap it.
 */
@Test
public void testMatchesDateRange() {
  final BloomFilter emptyFilter = LoadedFileFilter.createFilter(10);
  final LoadedFileFilter filter1 =
      new LoadedFileFilter(
          1, 0, Instant.now().minusSeconds(10), Instant.now().minusSeconds(5), emptyFilter);

  // Missing or empty ranges place no constraint at all.
  assertTrue(
      filter1.matchesDateRange(null), "Expected null range to be treated as an infinite range");
  assertTrue(
      filter1.matchesDateRange(new DateRangeParam()),
      "Expected empty range to be treated as an infinite range");

  // Open-ended ranges that overlap the filter's window.
  final DateRangeParam sinceYesterday =
      new DateRangeParam(
          new DateParam()
              .setPrefix(ParamPrefixEnum.GREATERTHAN)
              .setValue(Date.from(Instant.now().minus(1, ChronoUnit.DAYS))));
  assertTrue(filter1.matchesDateRange(sinceYesterday), "Expected since yesterday period to cover");

  final DateRangeParam beforeNow =
      new DateRangeParam(
          new DateParam().setPrefix(ParamPrefixEnum.LESSTHAN_OR_EQUALS).setValue(new Date()));
  // The message used to be a copy of the "since yesterday" one, which would misreport a failure.
  assertTrue(filter1.matchesDateRange(beforeNow), "Expected before now period to cover");

  // Open-ended ranges that lie entirely outside the filter's window.
  final DateRangeParam beforeYesterday =
      new DateRangeParam(
          new DateParam()
              .setPrefix(ParamPrefixEnum.LESSTHAN)
              .setValue(Date.from(Instant.now().minus(1, ChronoUnit.DAYS))));
  assertFalse(
      filter1.matchesDateRange(beforeYesterday), "Expected before yesterday period to not match");

  final DateRangeParam afterNow =
      new DateRangeParam(
          new DateParam().setPrefix(ParamPrefixEnum.GREATERTHAN_OR_EQUALS).setValue(new Date()));
  assertFalse(filter1.matchesDateRange(afterNow), "Expected after now period to not match");

  // Ranges with a bound inside the window (7 seconds ago) overlap it only partially.
  final DateRangeParam beforeSevenSeconds =
      new DateRangeParam(
          new DateParam()
              .setPrefix(ParamPrefixEnum.LESSTHAN)
              .setValue(Date.from(Instant.now().minus(7, ChronoUnit.SECONDS))));
  assertTrue(filter1.matchesDateRange(beforeSevenSeconds), "Expected partial match to match");

  final DateRangeParam afterSevenSeconds =
      new DateRangeParam(
          new DateParam()
              .setPrefix(ParamPrefixEnum.GREATERTHAN)
              .setValue(Date.from(Instant.now().minus(7, ChronoUnit.SECONDS))));
  assertTrue(filter1.matchesDateRange(afterSevenSeconds), "Expected partial match to match");

  // A closed range from 8 to 7 seconds ago sits inside the window.
  final DateRangeParam sevenSeconds =
      new DateRangeParam(
          Date.from(Instant.now().minusSeconds(8)), Date.from(Instant.now().minusSeconds(7)));
  assertTrue(filter1.matchesDateRange(sevenSeconds), "Expected partial match to match");
}
Aggregations