Search in sources :

Example 1 with BloomFilter

use of org.apache.spark.util.sketch.BloomFilter in project beneficiary-fhir-data by CMSgov.

the class LoadedFilterManager method buildFilter.

/**
 * Creates the {@link LoadedFileFilter} for one loaded file from the batches that belong to it.
 * Should behave as a pure function of its arguments.
 *
 * @param fileId id of the loaded file whose batches are looked up
 * @param firstUpdated time stamp passed through to the filter and used as the starting point
 *     when searching for the latest batch creation time
 * @param fetchById lookup that returns the list of batches for a file id
 * @return a newly built filter holding every beneficiary found in the fetched batches
 * @throws IllegalArgumentException if the lookup returns no batches
 */
public static LoadedFileFilter buildFilter(long fileId, Instant firstUpdated, Function<Long, List<LoadedBatch>> fetchById) {
    final List<LoadedBatch> batches = fetchById.apply(fileId);
    final int batchCount = batches.size();
    if (batchCount < 1) {
        throw new IllegalArgumentException("Batches cannot be empty for a filter");
    }

    // A good estimate of the entry count matters for both the false-positive probability and
    // the memory footprint of the filter. The estimate is derived from the first batch only,
    // i.e. it assumes every batch is the same size as the first one.
    final int idsPerBatch = (batches.get(0).getBeneficiaries().length() + BENE_ID_SIZE) / BENE_ID_SIZE;
    final int expectedEntries = idsPerBatch * batchCount;
    final BloomFilter beneficiaryFilter = LoadedFileFilter.createFilter(expectedEntries);

    // Single pass over the batches: add every beneficiary to the bloom filter and keep track
    // of the most recent creation time seen so far.
    Instant newestCreated = firstUpdated;
    for (final LoadedBatch current : batches) {
        for (final String beneficiaryId : current.getBeneficiariesAsList()) {
            beneficiaryFilter.putString(beneficiaryId);
        }
        final Instant created = current.getCreated();
        if (created.isAfter(newestCreated)) {
            newestCreated = created;
        }
    }

    LOGGER.info("Built a filter for {} with {} batches", fileId, batchCount);
    return new LoadedFileFilter(fileId, batchCount, firstUpdated, newestCreated, beneficiaryFilter);
}
Also used : Instant(java.time.Instant) BloomFilter(org.apache.spark.util.sketch.BloomFilter) LoadedBatch(gov.cms.bfd.model.rif.LoadedBatch)

Example 2 with BloomFilter

use of org.apache.spark.util.sketch.BloomFilter in project beneficiary-fhir-data by CMSgov.

the class LoadedFilterTest method testMightContain.

/**
 * Smoke test for {@code LoadedFileFilter.mightContain}: values that were put into the backing
 * bloom filter must be reported as possibly present, and a couple of values that were never
 * added are expected to be reported as absent.
 */
@Test
public void testMightContain() {
    // Very small test on top of the Spark sketch BloomFilter implementation
    // (org.apache.spark.util.sketch.BloomFilter). Assume that package works.
    final BloomFilter smallFilter = LoadedFileFilter.createFilter(10);
    smallFilter.putString("1");
    smallFilter.putString("100");
    // Deliberately add the same value a second time; a repeated insert must not break lookups.
    smallFilter.putString("100");
    final LoadedFileFilter filter1 = new LoadedFileFilter(1, 1, Instant.now().minusSeconds(10), Instant.now().minusSeconds(5), smallFilter);
    assertTrue(filter1.mightContain("1"));
    // Bloom filters never produce false negatives, so the twice-inserted value must be found.
    assertTrue(filter1.mightContain("100"));
    // NOTE(review): bloom filters can return false positives in general; these negative checks
    // presumably hold because these specific values do not collide in a filter of this size.
    assertFalse(filter1.mightContain("888"));
    assertFalse(filter1.mightContain("BAD"));
}
Also used : LoadedFileFilter(gov.cms.bfd.server.war.commons.LoadedFileFilter) BloomFilter(org.apache.spark.util.sketch.BloomFilter) Test(org.junit.jupiter.api.Test)

Example 3 with BloomFilter

use of org.apache.spark.util.sketch.BloomFilter in project beneficiary-fhir-data by CMSgov.

the class LoadedFilterTest method testMatchesDateRange.

/**
 * Checks {@code LoadedFileFilter.matchesDateRange} against a filter whose update window runs
 * from 10 seconds ago to 5 seconds ago, using null, empty, open-ended and closed date ranges.
 */
@Test
public void testMatchesDateRange() {
    // Read the clock once so that the filter window and every range bound below are all
    // relative to the same instant; the test outcome then cannot depend on execution speed.
    final Instant now = Instant.now();
    final Instant yesterday = now.minus(1, ChronoUnit.DAYS);
    final Instant sevenSecondsAgo = now.minus(7, ChronoUnit.SECONDS);

    final BloomFilter emptyFilter = LoadedFileFilter.createFilter(10);
    final LoadedFileFilter filter1 = new LoadedFileFilter(1, 0, now.minusSeconds(10), now.minusSeconds(5), emptyFilter);

    // Missing or empty ranges place no restriction on the filter.
    assertTrue(filter1.matchesDateRange(null), "Expected null range to be treated as an infinite range");
    assertTrue(filter1.matchesDateRange(new DateRangeParam()), "Expected empty range to be treated as an infinite range");

    // Open-ended ranges that fully cover the filter window.
    final DateRangeParam sinceYesterday = new DateRangeParam(new DateParam().setPrefix(ParamPrefixEnum.GREATERTHAN).setValue(Date.from(yesterday)));
    assertTrue(filter1.matchesDateRange(sinceYesterday), "Expected since yesterday period to cover");
    final DateRangeParam beforeNow = new DateRangeParam(new DateParam().setPrefix(ParamPrefixEnum.LESSTHAN_OR_EQUALS).setValue(Date.from(now)));
    assertTrue(filter1.matchesDateRange(beforeNow), "Expected before now period to cover");

    // Open-ended ranges that lie entirely outside the filter window.
    final DateRangeParam beforeYesterday = new DateRangeParam(new DateParam().setPrefix(ParamPrefixEnum.LESSTHAN).setValue(Date.from(yesterday)));
    assertFalse(filter1.matchesDateRange(beforeYesterday), "Expected before yesterday period to not match");
    final DateRangeParam afterNow = new DateRangeParam(new DateParam().setPrefix(ParamPrefixEnum.GREATERTHAN_OR_EQUALS).setValue(Date.from(now)));
    assertFalse(filter1.matchesDateRange(afterNow), "Expected after now period to not match");

    // Ranges with a bound inside the filter window (7 seconds ago falls between 10 and 5).
    final DateRangeParam beforeSevenSeconds = new DateRangeParam(new DateParam().setPrefix(ParamPrefixEnum.LESSTHAN).setValue(Date.from(sevenSecondsAgo)));
    assertTrue(filter1.matchesDateRange(beforeSevenSeconds), "Expected partial match to match");
    final DateRangeParam afterSevenSeconds = new DateRangeParam(new DateParam().setPrefix(ParamPrefixEnum.GREATERTHAN).setValue(Date.from(sevenSecondsAgo)));
    assertTrue(filter1.matchesDateRange(afterSevenSeconds), "Expected partial match to match");
    final DateRangeParam sevenSeconds = new DateRangeParam(Date.from(now.minusSeconds(8)), Date.from(sevenSecondsAgo));
    assertTrue(filter1.matchesDateRange(sevenSeconds), "Expected partial match to match");
}
Also used : DateRangeParam(ca.uhn.fhir.rest.param.DateRangeParam) LoadedFileFilter(gov.cms.bfd.server.war.commons.LoadedFileFilter) BloomFilter(org.apache.spark.util.sketch.BloomFilter) Date(java.util.Date) DateParam(ca.uhn.fhir.rest.param.DateParam) Test(org.junit.jupiter.api.Test)

Aggregations

BloomFilter (org.apache.spark.util.sketch.BloomFilter): 3, LoadedFileFilter (gov.cms.bfd.server.war.commons.LoadedFileFilter): 2, Test (org.junit.jupiter.api.Test): 2, DateParam (ca.uhn.fhir.rest.param.DateParam): 1, DateRangeParam (ca.uhn.fhir.rest.param.DateRangeParam): 1, LoadedBatch (gov.cms.bfd.model.rif.LoadedBatch): 1, Instant (java.time.Instant): 1, Date (java.util.Date): 1