org.apache.hadoop.util.bloom.DynamicBloomFilter Java Examples
The following examples show how to use
org.apache.hadoop.util.bloom.DynamicBloomFilter.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BloomMapFile.java From hadoop with Apache License 2.0 | 6 votes |
/**
 * Loads the bloom filter stored in {@code BLOOM_FILE_NAME} under {@code dirName}.
 * If the file cannot be opened or read, a warning is logged and
 * {@code bloomFilter} is set to {@code null}.
 *
 * @param dirName directory that holds the bloom filter file
 * @param conf configuration used to resolve the file system of {@code dirName}
 */
private void initBloomFilter(Path dirName, Configuration conf) {
  try {
    final FileSystem fileSystem = dirName.getFileSystem(conf);
    final Path bloomPath = new Path(dirName, BLOOM_FILE_NAME);
    // The stream is closed automatically, on success as well as on failure.
    try (DataInputStream stream = fileSystem.open(bloomPath)) {
      bloomFilter = new DynamicBloomFilter();
      bloomFilter.readFields(stream);
    }
  } catch (IOException ioe) {
    LOG.warn("Can't open BloomFilter: " + ioe + " - fallback to MapFile.");
    bloomFilter = null;
  }
}
Example #2
Source File: BloomAndUDFTest.java From incubator-hivemall with Apache License 2.0 | 6 votes |
/**
 * Builds a bloom filter holding {@code size} pseudo-random keys.
 * Each key is the hex string of a Gaussian sample drawn from a {@link Random}
 * seeded with {@code seed}, so the same arguments always yield the same keys.
 *
 * @param seed seed of the random generator that produces the keys
 * @param size number of keys to add
 * @return the populated filter
 */
@Nonnull
private static DynamicBloomFilter createBloomFilter(long seed, int size) {
  DynamicBloomFilter dbf = BloomFilterUtils.newDynamicBloomFilter(3000);
  final Key key = new Key();
  final Random rnd1 = new Random(seed);
  for (int i = 0; i < size; i++) {
    double d = rnd1.nextGaussian();
    String s = Double.toHexString(d);
    // Encode with an explicit charset so the key bytes do not depend on the
    // platform default encoding of the JVM running the test.
    key.set(s.getBytes(java.nio.charset.StandardCharsets.UTF_8), 1.0);
    dbf.add(key);
  }
  return dbf;
}
Example #3
Source File: BloomAndUDFTest.java From incubator-hivemall with Apache License 2.0 | 6 votes |
/**
 * Checks that the UDF applied to two serialized filters produces the same
 * serialized form as calling {@code and} on the filters directly.
 */
@Test
public void test() throws IOException, HiveException {
  final BloomAndUDF andUdf = new BloomAndUDF();

  final DynamicBloomFilter left = createBloomFilter(1L, 10000);
  final DynamicBloomFilter right = createBloomFilter(2L, 10000);

  // Serialize both inputs before 'left' is mutated below.
  final Text leftText = BloomFilterUtils.serialize(left, new Text());
  final Text rightText = BloomFilterUtils.serialize(right, new Text());

  left.and(right);
  final Text expected = BloomFilterUtils.serialize(left, new Text());

  final Text actual = andUdf.evaluate(leftText, rightText);
  Assert.assertEquals(expected, actual);

  final DynamicBloomFilter roundTripped =
      BloomFilterUtils.deserialize(actual, new DynamicBloomFilter());
  assertNotContains(left, roundTripped, 1L, 10000);
  assertNotContains(left, roundTripped, 2L, 10000);
}
Example #4
Source File: BloomOrUDFTest.java From incubator-hivemall with Apache License 2.0 | 6 votes |
/**
 * Builds a bloom filter holding {@code size} pseudo-random keys.
 * Each key is the hex string of a Gaussian sample drawn from a {@link Random}
 * seeded with {@code seed}, so the same arguments always yield the same keys.
 *
 * @param seed seed of the random generator that produces the keys
 * @param size number of keys to add
 * @return the populated filter
 */
@Nonnull
private static DynamicBloomFilter createBloomFilter(long seed, int size) {
  DynamicBloomFilter dbf = BloomFilterUtils.newDynamicBloomFilter(3000);
  final Key key = new Key();
  final Random rnd1 = new Random(seed);
  for (int i = 0; i < size; i++) {
    double d = rnd1.nextGaussian();
    String s = Double.toHexString(d);
    // Encode with an explicit charset so the key bytes do not depend on the
    // platform default encoding of the JVM running the test.
    key.set(s.getBytes(java.nio.charset.StandardCharsets.UTF_8), 1.0);
    dbf.add(key);
  }
  return dbf;
}
Example #5
Source File: BloomOrUDFTest.java From incubator-hivemall with Apache License 2.0 | 6 votes |
/**
 * Checks that the UDF applied to two serialized filters produces the same
 * serialized form as calling {@code or} on the filters directly.
 */
@Test
public void test() throws IOException, HiveException {
  final BloomOrUDF orUdf = new BloomOrUDF();

  final DynamicBloomFilter left = createBloomFilter(1L, 10000);
  final DynamicBloomFilter right = createBloomFilter(2L, 10000);

  // Serialize both inputs before 'left' is mutated below.
  final Text leftText = BloomFilterUtils.serialize(left, new Text());
  final Text rightText = BloomFilterUtils.serialize(right, new Text());

  left.or(right);
  final Text expected = BloomFilterUtils.serialize(left, new Text());

  final Text actual = orUdf.evaluate(leftText, rightText);
  Assert.assertEquals(expected, actual);

  final DynamicBloomFilter roundTripped =
      BloomFilterUtils.deserialize(actual, new DynamicBloomFilter());
  assertEquals(left, roundTripped, 1L, 10000);
  assertEquals(left, roundTripped, 2L, 10000);
}
Example #6
Source File: BloomNotUDFTest.java From incubator-hivemall with Apache License 2.0 | 6 votes |
/**
 * Builds a bloom filter holding {@code size} pseudo-random keys.
 * Each key is the hex string of a Gaussian sample drawn from a {@link Random}
 * seeded with {@code seed}, so the same arguments always yield the same keys.
 *
 * @param seed seed of the random generator that produces the keys
 * @param size number of keys to add
 * @return the populated filter
 */
@Nonnull
private static DynamicBloomFilter createBloomFilter(long seed, int size) {
  DynamicBloomFilter dbf = BloomFilterUtils.newDynamicBloomFilter(3000);
  final Key key = new Key();
  final Random rnd1 = new Random(seed);
  for (int i = 0; i < size; i++) {
    double d = rnd1.nextGaussian();
    String s = Double.toHexString(d);
    // Encode with an explicit charset so the key bytes do not depend on the
    // platform default encoding of the JVM running the test.
    key.set(s.getBytes(java.nio.charset.StandardCharsets.UTF_8), 1.0);
    dbf.add(key);
  }
  return dbf;
}
Example #7
Source File: BloomContainsUDFTest.java From incubator-hivemall with Apache License 2.0 | 6 votes |
/**
 * Builds a bloom filter holding {@code size} pseudo-random keys.
 * Each key is the hex string of a Gaussian sample, converted to bytes through
 * a {@code Text} instance.
 *
 * @param seed seed of the random generator that produces the keys
 * @param size number of keys to add
 * @return the populated filter
 */
@Nonnull
private static DynamicBloomFilter createBloomFilter(long seed, int size) {
  final DynamicBloomFilter filter = BloomFilterUtils.newDynamicBloomFilter(30);
  final Key reusableKey = new Key();
  final Random generator = new Random(seed);

  int remaining = size;
  while (remaining-- > 0) {
    final String hex = Double.toHexString(generator.nextGaussian());
    reusableKey.set(new Text(hex).copyBytes(), 1.0);
    filter.add(reusableKey);
  }
  return filter;
}
Example #8
Source File: BloomContainsUDFTest.java From incubator-hivemall with Apache License 2.0 | 6 votes |
/**
 * Replays the key sequence that was used to populate the filter and checks
 * that the UDF reports every one of those keys as contained.
 */
@Test
public void testUDF() throws IOException, HiveException {
  final BloomContainsUDF containsUdf = new BloomContainsUDF();
  final long seed = 43L;
  final int size = 100;

  final DynamicBloomFilter filter = createBloomFilter(seed, size);
  final Text serialized = BloomFilterUtils.serialize(filter, new Text());

  final Text probe = new Text();
  // Same seed as above, so this generator reproduces the inserted keys.
  final Random replay = new Random(seed);
  int remaining = size;
  while (remaining-- > 0) {
    probe.set(Double.toHexString(replay.nextGaussian()));
    Assert.assertEquals("Look up failed for key: " + probe, Boolean.TRUE,
        containsUdf.evaluate(serialized, probe));
  }
}
Example #9
Source File: BloomContainsAnyUDF.java From incubator-hivemall with Apache License 2.0 | 6 votes |
/**
 * Returns the filter encoded in {@code bloomStr}, reusing the previously
 * deserialized instance when the same serialized text is passed again.
 *
 * @param bloomStr serialized bloom filter
 * @return the deserialized (or cached) filter
 * @throws HiveException if deserialization fails with an {@link IOException}
 */
@Nonnull
private Filter getFilter(@Nonnull final Text bloomStr) throws HiveException {
  // Cache hit: same serialized form as the previous call.
  if (prevBf != null && prevBfStr.equals(bloomStr)) {
    return prevBf;
  }

  final Filter decoded;
  try {
    decoded = BloomFilterUtils.deserialize(bloomStr, new DynamicBloomFilter());
  } catch (IOException e) {
    throw new HiveException(e);
  }

  // Remember a copy of the text together with the decoded filter.
  this.prevBfStr = new Text(bloomStr);
  this.prevBf = decoded;
  return decoded;
}
Example #10
Source File: BloomContainsUDF.java From incubator-hivemall with Apache License 2.0 | 6 votes |
/**
 * Returns the filter encoded in {@code bloomStr}, reusing the previously
 * deserialized instance when the same serialized text is passed again.
 *
 * @param bloomStr serialized bloom filter
 * @return the deserialized (or cached) filter
 * @throws HiveException if deserialization fails with an {@link IOException}
 */
@Nonnull
private Filter getFilter(@Nonnull final Text bloomStr) throws HiveException {
  // Cache hit: same serialized form as the previous call.
  if (prevBf != null && prevBfStr.equals(bloomStr)) {
    return prevBf;
  }

  final Filter decoded;
  try {
    decoded = BloomFilterUtils.deserialize(bloomStr, new DynamicBloomFilter());
  } catch (IOException e) {
    throw new HiveException(e);
  }

  // Remember a copy of the text together with the decoded filter.
  this.prevBfStr = new Text(bloomStr);
  this.prevBf = decoded;
  return decoded;
}
Example #11
Source File: BloomMapFile.java From big-c with Apache License 2.0 | 6 votes |
/**
 * Loads the bloom filter stored in {@code BLOOM_FILE_NAME} under {@code dirName}.
 * If the file cannot be opened or read, a warning is logged and
 * {@code bloomFilter} is set to {@code null}.
 *
 * @param dirName directory that holds the bloom filter file
 * @param conf configuration used to resolve the file system of {@code dirName}
 */
private void initBloomFilter(Path dirName, Configuration conf) {
  try {
    final FileSystem fileSystem = dirName.getFileSystem(conf);
    final Path bloomPath = new Path(dirName, BLOOM_FILE_NAME);
    // The stream is closed automatically, on success as well as on failure.
    try (DataInputStream stream = fileSystem.open(bloomPath)) {
      bloomFilter = new DynamicBloomFilter();
      bloomFilter.readFields(stream);
    }
  } catch (IOException ioe) {
    LOG.warn("Can't open BloomFilter: " + ioe + " - fallback to MapFile.");
    bloomFilter = null;
  }
}
Example #12
Source File: BloomFilterUDAF.java From incubator-hivemall with Apache License 2.0 | 5 votes |
/**
 * Merges a serialized partial filter into this aggregation state.
 * The first partial becomes the state itself; later partials are OR-ed in.
 *
 * @param partial serialized bloom filter to merge
 * @return always {@code true}
 * @throws HiveException if {@code partial} cannot be deserialized
 */
public boolean merge(@Nonnull Text partial) throws HiveException {
  final DynamicBloomFilter incoming;
  try {
    incoming = BloomFilterUtils.deserialize(partial, new DynamicBloomFilter());
  } catch (IOException e) {
    throw new HiveException(e);
  }

  if (filter != null) {
    filter.or(incoming);
  } else {
    this.filter = incoming;
  }
  return true;
}
Example #13
Source File: BloomFilterUtils.java From incubator-hivemall with Apache License 2.0 | 5 votes |
/**
 * Creates a dynamic bloom filter sized for the given capacity, error rate and
 * number of hash functions.
 *
 * <p>The vector size is {@code ceil(-k * n / ln(1 - c^(1/k)))}, where
 * {@code k = nbHash}, {@code n = expectedNumberOfElements} and
 * {@code c = errorRate}.
 *
 * @param expectedNumberOfElements number of elements the filter is sized for
 * @param errorRate desired error rate
 * @param nbHash number of hash functions
 * @return a new filter using {@code Hash.MURMUR_HASH}
 */
@Nonnull
public static DynamicBloomFilter newDynamicBloomFilter(
    @Nonnegative final int expectedNumberOfElements, @Nonnegative final float errorRate,
    @Nonnegative final int nbHash) {
  // Multiply in double precision: the int product nbHash * expectedNumberOfElements
  // can overflow for large capacities and silently yield a wrong vector size.
  final double numerator = -((double) nbHash) * expectedNumberOfElements;
  final double denominator = Math.log(1.d - Math.pow(errorRate, 1.d / nbHash));
  int vectorSize = (int) Math.ceil(numerator / denominator);
  return new DynamicBloomFilter(vectorSize, nbHash, Hash.MURMUR_HASH,
      expectedNumberOfElements);
}
Example #14
Source File: BloomFilterUtils.java From incubator-hivemall with Apache License 2.0 | 5 votes |
@Nonnull public static DynamicBloomFilter newDynamicBloomFilter( @Nonnegative final int expectedNumberOfElements, @Nonnegative final float errorRate) { // k = ceil(-log_2(false prob.)) int nbHash = Math.max(2, (int) Math.ceil(-(Math.log(errorRate) / LOG2))); return newDynamicBloomFilter(expectedNumberOfElements, errorRate, nbHash); }
Example #15
Source File: BloomNotUDFTest.java From incubator-hivemall with Apache License 2.0 | 5 votes |
/**
 * Checks that the UDF applied to a serialized filter yields the same filter
 * (compared by {@code toString()}) as calling {@code not()} directly.
 */
@Test
public void test() throws IOException, HiveException {
  final BloomNotUDF notUdf = new BloomNotUDF();
  final DynamicBloomFilter source = createBloomFilter(1L, 10000);

  // Serialize before 'source' is negated in place below.
  final Text serialized = BloomFilterUtils.serialize(source, new Text());
  final Text udfOutput = notUdf.evaluate(serialized);
  final DynamicBloomFilter actual =
      BloomFilterUtils.deserialize(udfOutput, new DynamicBloomFilter());

  source.not();
  Assert.assertEquals(source.toString(), actual.toString());
}
Example #16
Source File: BloomMapFile.java From big-c with Apache License 2.0 | 5 votes |
private synchronized void initBloomFilter(Configuration conf) { numKeys = conf.getInt("io.mapfile.bloom.size", 1024 * 1024); // vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for // single key, where <code> is the number of hash functions, // <code>n</code> is the number of keys and <code>c</code> is the desired // max. error rate. // Our desired error rate is by default 0.005, i.e. 0.5% float errorRate = conf.getFloat("io.mapfile.bloom.error.rate", 0.005f); vectorSize = (int)Math.ceil((double)(-HASH_COUNT * numKeys) / Math.log(1.0 - Math.pow(errorRate, 1.0/HASH_COUNT))); bloomFilter = new DynamicBloomFilter(vectorSize, HASH_COUNT, Hash.getHashType(conf), numKeys); }
Example #17
Source File: BloomMapFile.java From hadoop with Apache License 2.0 | 5 votes |
private synchronized void initBloomFilter(Configuration conf) { numKeys = conf.getInt("io.mapfile.bloom.size", 1024 * 1024); // vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for // single key, where <code> is the number of hash functions, // <code>n</code> is the number of keys and <code>c</code> is the desired // max. error rate. // Our desired error rate is by default 0.005, i.e. 0.5% float errorRate = conf.getFloat("io.mapfile.bloom.error.rate", 0.005f); vectorSize = (int)Math.ceil((double)(-HASH_COUNT * numKeys) / Math.log(1.0 - Math.pow(errorRate, 1.0/HASH_COUNT))); bloomFilter = new DynamicBloomFilter(vectorSize, HASH_COUNT, Hash.getHashType(conf), numKeys); }
Example #18
Source File: DistinctAggregator.java From compiler with Apache License 2.0 | 5 votes |
/** {@inheritDoc} */
@Override
public void start(final EmitKey key) {
  super.start(key);

  // Start every key with a fresh filter; the aggregator argument is passed
  // as the last constructor parameter.
  final int bits = this.vectorSize;
  final int capacity = (int) this.getArg();
  this.filter = new DynamicBloomFilter(bits, HASH_COUNT, Hash.MURMUR_HASH, capacity);
}
Example #19
Source File: BloomMapFile.java From RDFS with Apache License 2.0 | 5 votes |
private synchronized void initBloomFilter(Configuration conf) { numKeys = conf.getInt("io.mapfile.bloom.size", 1024 * 1024); // vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for // single key, where <code> is the number of hash functions, // <code>n</code> is the number of keys and <code>c</code> is the desired // max. error rate. // Our desired error rate is by default 0.005, i.e. 0.5% float errorRate = conf.getFloat("io.mapfile.bloom.error.rate", 0.005f); vectorSize = (int)Math.ceil((double)(-HASH_COUNT * numKeys) / Math.log(1.0 - Math.pow(errorRate, 1.0/HASH_COUNT))); bloomFilter = new DynamicBloomFilter(vectorSize, HASH_COUNT, Hash.getHashType(conf), numKeys); }
Example #20
Source File: BloomMapFile.java From RDFS with Apache License 2.0 | 5 votes |
/**
 * Loads the bloom filter stored in {@code BLOOM_FILE_NAME} under {@code dirName}.
 * If the file cannot be opened or read, a warning is logged and
 * {@code bloomFilter} is set to {@code null}.
 *
 * @param fs file system used to open the bloom filter file
 * @param dirName directory that holds the bloom filter file
 * @param conf configuration (not used by this method)
 */
private void initBloomFilter(FileSystem fs, String dirName,
    Configuration conf) {
  DataInputStream in = null;
  try {
    in = fs.open(new Path(dirName, BLOOM_FILE_NAME));
    bloomFilter = new DynamicBloomFilter();
    bloomFilter.readFields(in);
    in.close();
    // Mark as closed so the finally block does not close it a second time.
    in = null;
  } catch (IOException ioe) {
    LOG.warn("Can't open BloomFilter: " + ioe + " - fallback to MapFile.");
    bloomFilter = null;
  } finally {
    // Reached with a non-null stream only when reading or closing failed;
    // release the handle so a failed read does not leak it.
    if (in != null) {
      try {
        in.close();
      } catch (IOException ignored) {
        // Best-effort cleanup: the original failure was already logged above.
      }
    }
  }
}
Example #21
Source File: BloomMapFile.java From hadoop-gpu with Apache License 2.0 | 5 votes |
private synchronized void initBloomFilter(Configuration conf) { numKeys = conf.getInt("io.mapfile.bloom.size", 1024 * 1024); // vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for // single key, where <code> is the number of hash functions, // <code>n</code> is the number of keys and <code>c</code> is the desired // max. error rate. // Our desired error rate is by default 0.005, i.e. 0.5% float errorRate = conf.getFloat("io.mapfile.bloom.error.rate", 0.005f); vectorSize = (int)Math.ceil((double)(-HASH_COUNT * numKeys) / Math.log(1.0 - Math.pow(errorRate, 1.0/HASH_COUNT))); bloomFilter = new DynamicBloomFilter(vectorSize, HASH_COUNT, Hash.getHashType(conf), numKeys); }
Example #22
Source File: BloomMapFile.java From hadoop-gpu with Apache License 2.0 | 5 votes |
/**
 * Loads the bloom filter stored in {@code BLOOM_FILE_NAME} under {@code dirName}.
 * If the file cannot be opened or read, a warning is logged and
 * {@code bloomFilter} is set to {@code null}.
 *
 * @param fs file system used to open the bloom filter file
 * @param dirName directory that holds the bloom filter file
 * @param conf configuration (not used by this method)
 */
private void initBloomFilter(FileSystem fs, String dirName,
    Configuration conf) {
  DataInputStream in = null;
  try {
    in = fs.open(new Path(dirName, BLOOM_FILE_NAME));
    bloomFilter = new DynamicBloomFilter();
    bloomFilter.readFields(in);
    in.close();
    // Mark as closed so the finally block does not close it a second time.
    in = null;
  } catch (IOException ioe) {
    LOG.warn("Can't open BloomFilter: " + ioe + " - fallback to MapFile.");
    bloomFilter = null;
  } finally {
    // Reached with a non-null stream only when reading or closing failed;
    // release the handle so a failed read does not leak it.
    if (in != null) {
      try {
        in.close();
      } catch (IOException ignored) {
        // Best-effort cleanup: the original failure was already logged above.
      }
    }
  }
}
Example #23
Source File: BloomFilterUtils.java From incubator-hivemall with Apache License 2.0 | 4 votes |
/**
 * Creates a dynamic bloom filter for the given capacity using
 * {@code DEFAULT_ERROR_RATE}.
 *
 * @param expectedNumberOfElements number of elements the filter is sized for
 * @return a new filter
 */
@Nonnull
public static DynamicBloomFilter newDynamicBloomFilter(
    @Nonnegative final int expectedNumberOfElements) {
  final DynamicBloomFilter created =
      newDynamicBloomFilter(expectedNumberOfElements, DEFAULT_ERROR_RATE);
  return created;
}
Example #24
Source File: BloomFilterUtils.java From incubator-hivemall with Apache License 2.0 | 4 votes |
/**
 * Creates a dynamic bloom filter from the class defaults:
 * {@code DEFAULT_BLOOM_FILTER_SIZE}, {@code DEFAULT_ERROR_RATE} and
 * {@code NUM_HASHES}.
 *
 * @return a new filter
 */
@Nonnull
public static DynamicBloomFilter newDynamicBloomFilter() {
  final DynamicBloomFilter created =
      newDynamicBloomFilter(DEFAULT_BLOOM_FILTER_SIZE, DEFAULT_ERROR_RATE, NUM_HASHES);
  return created;
}