use of org.apache.hadoop.util.bloom.Key in project compiler by boalang.
the class DistinctAggregator method aggregate.
/** {@inheritDoc} */
@Override
public void aggregate(final String data, final String metadata) throws IOException, InterruptedException {
// instantiate a bloom filter input key initialized by the data
Key key = new Key(data.getBytes());
// if the key is already in the filter, forget it
if (this.filter.membershipTest(key))
return;
// add the key to the bloom filter
this.filter.add(key);
// and collect it
this.collect(data);
}
use of org.apache.hadoop.util.bloom.Key in project compiler by boalang.
the class UniqueAggregator method aggregate.
/** {@inheritDoc} */
@Override
public void aggregate(final String data, final String metadata) throws IOException, InterruptedException {
// instantiate a bloom filter input key initialized by the data
final Key key = new Key(data.getBytes());
// if the key is already in the filter, forget about it
if (this.filter.membershipTest(key))
return;
// add the key to the bloom filter
this.filter.add(key);
if (this.isCombining())
this.collect(data);
else
this.total++;
}
use of org.apache.hadoop.util.bloom.Key in project accumulo by apache.
the class RowFunctor method transform.
@Override
public Key transform(org.apache.accumulo.core.data.Key acuKey) {
byte[] keyData;
ByteSequence row = acuKey.getRowData();
keyData = new byte[row.length()];
System.arraycopy(row.getBackingArray(), 0, keyData, 0, row.length());
return new Key(keyData, 1.0);
}
use of org.apache.hadoop.util.bloom.Key in project Gaffer by gchq.
the class AccumuloIDBetweenSetsRetrieverTest method shouldDealWithFalsePositives.
private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store) throws StoreException {
final Set<EntityId> seeds = new HashSet<>();
seeds.add(AccumuloTestData.SEED_A0);
seeds.add(AccumuloTestData.SEED_A23);
// positive sensible.
for (int i = 0; i < 10; i++) {
seeds.add(new EntitySeed("abc" + i));
}
// Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the
// one that GraphElementsWithStatisticsWithinSetRetriever creates.
final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
if (!loadIntoMemory) {
store.getProperties().setMaxEntriesForBatchScanner("20");
}
// Find something that will give a false positive
// Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method.
// Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
// maxBloomFilterToPassToAnIterator bytes.
int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / Math.pow(Math.log(2.0), 2.0));
size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator());
// Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
// (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));
// Create Bloom filter and add seeds to it
final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
for (final EntityId seed : seeds) {
filter.add(new Key(store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex())));
}
// Test random items against it - should only have to shouldRetrieveElementsInRangeBetweenSeeds MAX_SIZE_BLOOM_FILTER / 2 on average before find a
// false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem).
int count = 0;
int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator();
while (count < maxNumberOfTries) {
count++;
if (filter.membershipTest(new Key(("" + count).getBytes()))) {
break;
}
}
if (count == maxNumberOfTries) {
fail("Didn't find a false positive");
}
// False positive is "" + count so create an edge from seeds to that
final Edge edge = new Edge.Builder().group(TestGroups.EDGE).source("A0").dest("" + count).directed(true).build();
edge.putProperty(AccumuloPropertyNames.COUNT, 1000000);
Set<Element> data = new HashSet<>();
data.add(edge);
final User user = new User();
addElements(data, store, user);
// Now query for all edges in set - shouldn't get the false positive
GetElementsBetweenSets op = new GetElementsBetweenSets.Builder().input(AccumuloTestData.SEED_A0_SET).inputB(seeds).view(defaultView).build();
final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory);
// Check results are as expected
assertThat(results).hasSize(2).contains(AccumuloTestData.EDGE_A0_A23, AccumuloTestData.A0_ENTITY);
}
use of org.apache.hadoop.util.bloom.Key in project Gaffer by gchq.
the class FilterWritabilityTest method shouldAcceptValidFilter.
@Test
public void shouldAcceptValidFilter() {
// Given
final BloomFilter filter = new BloomFilter(100, 5, Hash.MURMUR_HASH);
filter.add(new Key("ABC".getBytes()));
filter.add(new Key("DEF".getBytes()));
// Then
assertTrue(filter.membershipTest(new Key("ABC".getBytes())));
assertTrue(filter.membershipTest(new Key("DEF".getBytes())));
assertFalse(filter.membershipTest(new Key("lkjhgfdsa".getBytes())));
}
Aggregations