Example 1 with BloomFilter

Use of org.apache.hadoop.util.bloom.BloomFilter in project Gaffer by gchq.

The class CoreKeyBloomFilterIterator, method validateOptions:

@Override
public boolean validateOptions(final Map<String, String> options) {
    if (!super.validateOptions(options)) {
        return false;
    }
    if (!options.containsKey(AccumuloStoreConstants.BLOOM_FILTER)) {
        throw new BloomFilterIteratorException("Must set the " + AccumuloStoreConstants.BLOOM_FILTER + " option");
    }
    filter = new BloomFilter();
    final byte[] bytes;
    try {
        bytes = options.get(AccumuloStoreConstants.BLOOM_FILTER).getBytes(AccumuloStoreConstants.BLOOM_FILTER_CHARSET);
    } catch (UnsupportedEncodingException e) {
        throw new BloomFilterIteratorException("Failed to re-create serialised bloom filter", e);
    }
    try (final InputStream inStream = new ByteArrayInputStream(bytes);
        final DataInputStream dataStream = new DataInputStream(inStream)) {
        filter.readFields(dataStream);
    } catch (final IOException e) {
        throw new BloomFilterIteratorException("Failed to re-create serialised bloom filter", e);
    }
    return true;
}
Also used: BloomFilterIteratorException (uk.gov.gchq.gaffer.accumulostore.key.exception.BloomFilterIteratorException), ByteArrayInputStream (java.io.ByteArrayInputStream), DataInputStream (java.io.DataInputStream), InputStream (java.io.InputStream), UnsupportedEncodingException (java.io.UnsupportedEncodingException), IOException (java.io.IOException), BloomFilter (org.apache.hadoop.util.bloom.BloomFilter)
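
For context, the option string consumed above has to be produced somewhere by serialising a BloomFilter with the matching charset. Below is a minimal sketch of that producer side, assuming a byte-preserving charset such as ISO-8859-1; the class and helper name are illustrative, not Gaffer's actual code.

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.util.bloom.BloomFilter;

public final class BloomFilterOptions {

    // Serialise a Hadoop BloomFilter to a String so it can be passed as an
    // iterator option and re-read with readFields(...), as validateOptions does above.
    // Only a byte-preserving charset (e.g. ISO-8859-1) round-trips correctly.
    public static String bloomFilterToString(final BloomFilter filter, final String charset) throws IOException {
        final ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
        try (DataOutputStream dataStream = new DataOutputStream(byteStream)) {
            filter.write(dataStream); // Writable counterpart of readFields
        }
        return byteStream.toString(charset);
    }
}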

Example 2 with BloomFilter

Use of org.apache.hadoop.util.bloom.BloomFilter in project Gaffer by gchq.

The class AccumuloIDBetweenSetsRetrieverTest, method shouldDealWithFalsePositives:

private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store) throws StoreException {
    final Set<EntityId> seeds = new HashSet<>();
    seeds.add(AccumuloTestData.SEED_A0);
    seeds.add(AccumuloTestData.SEED_A23);
    // Add a bunch of items that aren't in the data to make the probability of finding a false
    // positive sensible.
    for (int i = 0; i < 10; i++) {
        seeds.add(new EntitySeed("abc" + i));
    }
    // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the
    // one that GraphElementsWithStatisticsWithinSetRetriever creates.
    final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        store.getProperties().setMaxEntriesForBatchScanner("20");
    }
    // Find something that will give a false positive
    // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method.
    // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
    // maxBloomFilterToPassToAnIterator bytes.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / Math.pow(Math.log(2.0), 2.0));
    size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator());
    // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
    // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
    final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));
    // Create Bloom filter and add seeds to it
    final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (final EntityId seed : seeds) {
        filter.add(new Key(store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex())));
    }
    // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before
    // finding a false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem).
    int count = 0;
    int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator();
    while (count < maxNumberOfTries) {
        count++;
        if (filter.membershipTest(new Key(("" + count).getBytes()))) {
            break;
        }
    }
    if (count == maxNumberOfTries) {
        fail("Didn't find a false positive");
    }
    // False positive is "" + count so create an edge from seeds to that
    final Edge edge = new Edge.Builder().group(TestGroups.EDGE).source("A0").dest("" + count).directed(true).build();
    edge.putProperty(AccumuloPropertyNames.COUNT, 1000000);
    Set<Element> data = new HashSet<>();
    data.add(edge);
    final User user = new User();
    addElements(data, store, user);
    // Now query for all edges in set - shouldn't get the false positive
    GetElementsBetweenSets op = new GetElementsBetweenSets.Builder().input(AccumuloTestData.SEED_A0_SET).inputB(seeds).view(defaultView).build();
    final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory);
    // Check results are as expected
    assertThat(results).hasSize(2).contains(AccumuloTestData.EDGE_A0_A23, AccumuloTestData.A0_ENTITY);
}
Also used: User (uk.gov.gchq.gaffer.user.User), GetElementsBetweenSets (uk.gov.gchq.gaffer.accumulostore.operation.impl.GetElementsBetweenSets), Element (uk.gov.gchq.gaffer.data.element.Element), BloomFilter (org.apache.hadoop.util.bloom.BloomFilter), EntityId (uk.gov.gchq.gaffer.data.element.id.EntityId), EntitySeed (uk.gov.gchq.gaffer.operation.data.EntitySeed), Edge (uk.gov.gchq.gaffer.data.element.Edge), Key (org.apache.hadoop.util.bloom.Key), HashSet (java.util.HashSet)
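
The sizing arithmetic in the comments above is the standard Bloom filter optimum: a filter of m = -n ln(p) / (ln 2)^2 bits with k = (m / n) ln 2 hash functions gives a false positive rate of roughly p for n items. The sketch below reruns that calculation for the in-memory case here (n = 12: the two real seeds plus the ten "abc" seeds), including the integer division that the test deliberately mirrors from the retriever.

import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.hash.Hash;

public final class BloomFilterSizing {

    public static void main(final String[] args) {
        final int n = 12;          // items to be added
        final double p = 0.0001;   // target false positive rate, 1 in 10000

        // Optimal filter size in bits: m = -n * ln(p) / (ln 2)^2
        final int size = (int) (-n * Math.log(p) / Math.pow(Math.log(2.0), 2.0));

        // Optimal number of hashes: k = (m / n) * ln 2. Note that size / n is
        // integer division, truncating before the multiplication - the same
        // quirk the test reproduces to stay in step with the retriever.
        final int numHashes = Math.max(1, (int) ((size / n) * Math.log(2)));

        final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
        System.out.println("size = " + size + " bits, numHashes = " + numHashes);
    }
}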

Example 3 with BloomFilter

Use of org.apache.hadoop.util.bloom.BloomFilter in project common-crawl by matpalm.

The class ReadNgram, method main:

public static void main(String[] s) throws IOException {
    Configuration conf = new Configuration();
    String filename = "bfngrams/out/part-00000";
    FileSystem fs = FileSystem.get(URI.create(filename), conf);
    Path path = new Path(filename);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    NullWritable nullKey = NullWritable.get();
    BloomFilter bloomFilter = new BloomFilter();
    reader.next(nullKey, bloomFilter);
    reader.close();
    System.out.println(bloomFilter.toString());
    String[] egs = { "activities other", "membership organizations", "organizations elsewhere", "4 0", "elsewhere classified", "other membership", "0 activities", "20091128093155 4" };
    for (String eg : egs) {
        Key k = new Key(eg.getBytes());
        System.out.println(eg + "\t" + bloomFilter.membershipTest(k));
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), SequenceFile (org.apache.hadoop.io.SequenceFile), FileSystem (org.apache.hadoop.fs.FileSystem), NullWritable (org.apache.hadoop.io.NullWritable), BloomFilter (org.apache.hadoop.util.bloom.BloomFilter), Key (org.apache.hadoop.util.bloom.Key)
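
ReadNgram only shows the reading half; the filter must have been written to the SequenceFile by an earlier job in the pipeline. The following is a hedged sketch of what that writer side could look like, using the same simple (if deprecated) SequenceFile API as the reader; the path, filter dimensions and example ngrams are illustrative, not the project's actual values.

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

public class WriteNgram {

    public static void main(final String[] s) throws IOException {
        final Configuration conf = new Configuration();
        final String filename = "bfngrams/out/part-00000"; // illustrative path
        final FileSystem fs = FileSystem.get(URI.create(filename), conf);
        final Path path = new Path(filename);

        // Build a filter and add a couple of ngrams to it.
        final BloomFilter bloomFilter = new BloomFilter(100000, 5, Hash.MURMUR_HASH);
        bloomFilter.add(new Key("activities other".getBytes()));
        bloomFilter.add(new Key("membership organizations".getBytes()));

        // Write it as the single value of a NullWritable-keyed SequenceFile,
        // mirroring how ReadNgram reads it back.
        try (SequenceFile.Writer writer =
                SequenceFile.createWriter(fs, conf, path, NullWritable.class, BloomFilter.class)) {
            writer.append(NullWritable.get(), bloomFilter);
        }
    }
}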

Example 4 with BloomFilter

Use of org.apache.hadoop.util.bloom.BloomFilter in project Gaffer by gchq.

The class AccumuloIDWithinSetRetrieverTest, method shouldDealWithFalsePositives:

private void shouldDealWithFalsePositives(final boolean loadIntoMemory, final AccumuloStore store) throws StoreException {
    // Query for all edges in set {A0, A23}
    final Set<EntityId> seeds = new HashSet<>();
    seeds.add(AccumuloTestData.SEED_A0);
    seeds.add(AccumuloTestData.SEED_A23);
    // Add a bunch of items that aren't in the data to make the probability of finding a false
    // positive sensible.
    for (int i = 0; i < 10; i++) {
        seeds.add(new EntitySeed("abc" + i));
    }
    // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the
    // one that GraphElementsWithStatisticsWithinSetRetriever creates.
    final int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        store.getProperties().setMaxEntriesForBatchScanner("20");
    }
    // Find something that will give a false positive
    // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method.
    // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
    // maxBloomFilterToPassToAnIterator bytes.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / Math.pow(Math.log(2.0), 2.0));
    size = Math.min(size, store.getProperties().getMaxBloomFilterToPassToAnIterator());
    // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
    // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
    final int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));
    // Create Bloom filter and add seeds to it
    final BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (final EntityId seed : seeds) {
        filter.add(new Key(store.getKeyPackage().getKeyConverter().serialiseVertex(seed.getVertex())));
    }
    // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before
    // finding a false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem).
    int count = 0;
    int maxNumberOfTries = 50 * store.getProperties().getMaxBloomFilterToPassToAnIterator();
    while (count < maxNumberOfTries) {
        count++;
        if (filter.membershipTest(new Key(("" + count).getBytes()))) {
            break;
        }
    }
    if (count == maxNumberOfTries) {
        fail("Didn't find a false positive");
    }
    // False positive is "" + count. Now query for all edges in the set - shouldn't get the false positive.
    final GetElementsWithinSet op = new GetElementsWithinSet.Builder().view(defaultView).input(seeds).build();
    final Set<Element> results = returnElementsFromOperation(store, op, new User(), loadIntoMemory);
    // Check results are as expected
    assertThat(results).contains(AccumuloTestData.EDGE_A0_A23, AccumuloTestData.A0_ENTITY, AccumuloTestData.A23_ENTITY);
}
Also used: EntityId (uk.gov.gchq.gaffer.data.element.id.EntityId), User (uk.gov.gchq.gaffer.user.User), Element (uk.gov.gchq.gaffer.data.element.Element), EntitySeed (uk.gov.gchq.gaffer.operation.data.EntitySeed), GetElementsWithinSet (uk.gov.gchq.gaffer.accumulostore.operation.impl.GetElementsWithinSet), BloomFilter (org.apache.hadoop.util.bloom.BloomFilter), Key (org.apache.hadoop.util.bloom.Key), HashSet (java.util.HashSet)
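
Both Gaffer tests depend on finding a false positive quickly. Stripped of the store scaffolding, the hunt reduces to the self-contained sketch below; the filter dimensions are illustrative (with roughly 230 bits, 13 hashes and 12 entries, the expected number of probes before a hit is on the order of 1/p = 10000).

import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

public class FalsePositiveHunt {

    public static void main(final String[] args) {
        // Deliberately small filter so a false positive turns up quickly.
        final BloomFilter filter = new BloomFilter(230, 13, Hash.MURMUR_HASH);
        for (int i = 0; i < 12; i++) {
            filter.add(new Key(("seed" + i).getBytes()));
        }

        // Probe increasing integers until membershipTest reports an item
        // that was never added - a false positive by construction.
        final int maxTries = 1000000; // arbitrary cap to avoid an infinite loop
        for (int count = 1; count <= maxTries; count++) {
            if (filter.membershipTest(new Key(("" + count).getBytes()))) {
                System.out.println("False positive: \"" + count + "\"");
                return;
            }
        }
        System.out.println("No false positive found in " + maxTries + " tries");
    }
}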

Example 5 with BloomFilter

Use of org.apache.hadoop.util.bloom.BloomFilter in project Gaffer by gchq.

The class CoreKeyBloomFilterIterator, method init:

@Override
public void init(final SortedKeyValueIterator<Key, Value> source, final Map<String, String> options, final IteratorEnvironment env) throws IOException {
    super.init(source, options, env);
    filter = new BloomFilter();
    final byte[] bytes;
    try {
        bytes = options.get(AccumuloStoreConstants.BLOOM_FILTER).getBytes(AccumuloStoreConstants.BLOOM_FILTER_CHARSET);
    } catch (final UnsupportedEncodingException e) {
        throw new BloomFilterIteratorException("Failed to re-create serialised bloom filter", e);
    }
    try (final InputStream inStream = new ByteArrayInputStream(bytes);
        final DataInputStream dataStream = new DataInputStream(inStream)) {
        filter.readFields(dataStream);
    } catch (final IOException e) {
        throw new BloomFilterIteratorException("Failed to re-create serialised bloom filter", e);
    }
    LOGGER.debug("Initialised CoreKeyBloomFilterIterator");
}
Also used: BloomFilterIteratorException (uk.gov.gchq.gaffer.accumulostore.key.exception.BloomFilterIteratorException), ByteArrayInputStream (java.io.ByteArrayInputStream), DataInputStream (java.io.DataInputStream), InputStream (java.io.InputStream), UnsupportedEncodingException (java.io.UnsupportedEncodingException), IOException (java.io.IOException), BloomFilter (org.apache.hadoop.util.bloom.BloomFilter)
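
Putting examples 1 and 5 together, the sketch below round-trips a filter through write(DataOutput), a String, and readFields(DataInput), checking that membership survives. The ISO-8859-1 literal stands in for AccumuloStoreConstants.BLOOM_FILTER_CHARSET as an assumption; it is a sensible choice because it maps bytes to characters one-to-one.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

public class BloomFilterRoundTrip {

    public static void main(final String[] args) throws IOException {
        final String charset = "ISO-8859-1"; // assumed byte-preserving charset

        final BloomFilter original = new BloomFilter(230, 13, Hash.MURMUR_HASH);
        original.add(new Key("A0".getBytes(charset)));

        // Serialise to a String, as an iterator option would carry it.
        final ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
        try (DataOutputStream out = new DataOutputStream(byteStream)) {
            original.write(out);
        }
        final String option = byteStream.toString(charset);

        // Deserialise exactly as validateOptions and init do above.
        final BloomFilter restored = new BloomFilter();
        try (DataInputStream in = new DataInputStream(
                new ByteArrayInputStream(option.getBytes(charset)))) {
            restored.readFields(in);
        }

        System.out.println("A0 present after round trip: "
                + restored.membershipTest(new Key("A0".getBytes(charset))));
    }
}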

Aggregations

BloomFilter (org.apache.hadoop.util.bloom.BloomFilter) 5
Key (org.apache.hadoop.util.bloom.Key) 3
ByteArrayInputStream (java.io.ByteArrayInputStream) 2
DataInputStream (java.io.DataInputStream) 2
IOException (java.io.IOException) 2
InputStream (java.io.InputStream) 2
UnsupportedEncodingException (java.io.UnsupportedEncodingException) 2
HashSet (java.util.HashSet) 2
BloomFilterIteratorException (uk.gov.gchq.gaffer.accumulostore.key.exception.BloomFilterIteratorException) 2
Element (uk.gov.gchq.gaffer.data.element.Element) 2
EntityId (uk.gov.gchq.gaffer.data.element.id.EntityId) 2
EntitySeed (uk.gov.gchq.gaffer.operation.data.EntitySeed) 2
User (uk.gov.gchq.gaffer.user.User) 2
Configuration (org.apache.hadoop.conf.Configuration) 1
Edge (uk.gov.gchq.gaffer.data.element.Edge) 1
FileSystem (org.apache.hadoop.fs.FileSystem) 1
Path (org.apache.hadoop.fs.Path) 1
NullWritable (org.apache.hadoop.io.NullWritable) 1
SequenceFile (org.apache.hadoop.io.SequenceFile) 1
GetElementsBetweenSets (uk.gov.gchq.gaffer.accumulostore.operation.impl.GetElementsBetweenSets) 1
GetElementsWithinSet (uk.gov.gchq.gaffer.accumulostore.operation.impl.GetElementsWithinSet) 1