com.clearspring.analytics.stream.cardinality.HyperLogLogPlus Java Examples
The following examples show how to use
com.clearspring.analytics.stream.cardinality.HyperLogLogPlus.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: FacetTableFunction.java From datawave with Apache License 2.0 | 6 votes |
/**
 * Turns a key/value entry into a key/document entry holding a {@link Cardinality}.
 *
 * <p>The key's column family and row are each split with the NUL (U+0000) separator string.
 * The second column-family token becomes the document field name and the second row token
 * becomes the cardinality content. The entry value is deserialized into a
 * {@link HyperLogLogPlus} sketch.
 *
 * @param input entry whose value carries serialized HyperLogLogPlus bytes
 * @return an immutable entry pairing the original key with the new document
 * @throws RuntimeException wrapping any {@link IOException} raised while building the sketch
 */
@Override
public Entry<Key,Document> apply(Entry<Key,Value> input) {
    final Key entryKey = input.getKey();
    final Document document = new Document();
    try {
        final String[] familyParts = StringUtils.split(entryKey.getColumnFamily().toString(), "\u0000");
        final String[] rowParts = StringUtils.split(entryKey.getRow().toString(), "\u0000");

        final HyperLogLogPlus sketch = HyperLogLogPlus.Builder.build(input.getValue().get());
        final FieldValueCardinality fieldCardinality = new FieldValueCardinality(sketch);
        fieldCardinality.setContent(rowParts[1]);

        document.put(familyParts[1], new Cardinality(fieldCardinality, entryKey, false));
        return Maps.immutableEntry(entryKey, document);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Example #2
Source File: StringQuality.java From DataVec with Apache License 2.0 | 6 votes |
/**
 * Creates a StringQuality whose distinct-value sketch is sized from a target relative
 * standard deviation.
 *
 * <p>The sketch is streamlib's HyperLogLogPlus, based on "HyperLogLog in Practice: Algorithmic
 * Engineering of a State of The Art Cardinality Estimation Algorithm"
 * (<a href="http://dx.doi.org/10.1145/2452376.2452456">paper</a>). Its relative accuracy is
 * approximately {@code 1.054 / sqrt(2^p)}. The sparse precision is passed as {@code 0}; a
 * nonzero {@code sp > p} would enable the sparse register representation, which may reduce
 * memory use and improve accuracy at small cardinalities.
 *
 * @param relativeSD target relative standard deviation used to derive the precision {@code p}
 */
public StringQuality(long countValid, long countInvalid, long countMissing, long countTotal,
                long countEmptyString, long countAlphabetic, long countNumerical, long countWordCharacter,
                long countWhitespace, double relativeSD) {
    this(countValid, countInvalid, countMissing, countTotal, countEmptyString, countAlphabetic,
                    countNumerical, countWordCharacter, countWhitespace,
                    new HyperLogLogPlus(precisionFor(relativeSD), 0));
}

/** Returns {@code ceil(2 * log2(1.054 / relativeSD))}, the precision passed to HyperLogLogPlus. */
private static int precisionFor(double relativeSD) {
    return (int) Math.ceil(2.0 * Math.log(1.054 / relativeSD) / Math.log(2));
}
Example #3
Source File: StatsHyperLogSummaryTest.java From datawave with Apache License 2.0 | 6 votes |
/** * Randomly populates a {@link HyperLogLogPlus} object. */ private HyperLogLogPlus createHyperLog() { Set<String> unique = new HashSet<>(); HyperLogLogPlus logPlus = new HyperLogLogPlus(StatsJob.HYPERLOG_NORMAL_DEFAULT_VALUE, StatsJob.HYPERLOG_SPARSE_DEFAULT_VALUE); this.uniqueCount = rVal.nextInt(MAX_UNIQUE_VALUES - MIN_UNIQUE_VALUES) + MIN_UNIQUE_VALUES; for (int n = 0; n < this.uniqueCount;) { int len = 4 + rVal.nextInt(10); String str = RandomStringUtils.randomAlphabetic(len); if (unique.add(str)) { logPlus.offer(str); n++; } } log.debug("unique strings added to hyper log(" + this.uniqueCount + ")"); // add duplicates List<String> values = new ArrayList<>(unique); int dups = rVal.nextInt(MAX_DUP_VALUES - MIN_DUP_VALUES) + MIN_DUP_VALUES; for (int n = 0; n < dups; n++) { int idx = rVal.nextInt(values.size()); logPlus.offer(values.get(idx)); } return logPlus; }
Example #4
Source File: StatsHyperLogSummaryTest.java From datawave with Apache License 2.0 | 6 votes |
@Test public void testSerialize() throws IOException { for (int n = 0; n < 10; n++) { HyperLogLogPlus logPlus = createHyperLog(); final StatsHyperLogSummary before = new StatsHyperLogSummary(n, logPlus, this.uniqueCount); byte[] bytes = before.toByteArray(); Value value = new Value(bytes); final StatsHyperLogSummary after = new StatsHyperLogSummary(value); log.debug("before(" + before + ")"); log.debug("after(" + after + ")"); Assert.assertEquals(before, after); Assert.assertEquals(0, before.compareTo(after)); Assert.assertEquals(before.getCount(), after.getCount()); HyperLogLogPlus logPlusBefore = before.getHyperLogPlus(); HyperLogLogPlus logPlusAfter = after.getHyperLogPlus(); Assert.assertEquals(logPlusBefore.cardinality(), logPlusAfter.cardinality()); // may not be true for large sample set but for small sample it is correct Assert.assertEquals(this.uniqueCount, logPlusAfter.cardinality()); Assert.assertEquals(this.uniqueCount, after.getUniqueCount()); Assert.assertEquals(this.uniqueCount, before.getUniqueCount()); } }
Example #5
Source File: CardinalityScanner.java From datawave with Apache License 2.0 | 6 votes |
/**
 * Folds one field name/value sketch into the running intersection totals.
 *
 * <p>Nothing happens when {@code fieldName} equals {@code tuple.getValue0()}. Otherwise the
 * cardinality of {@code hllp} is added to the stored sum for the pair, and a copy of
 * {@code hllp} merged with the stored union is saved. When the pair has no stored sum yet,
 * {@code baseHllp} supplies both the starting sum and the starting union.
 *
 * @param fieldName field name of the pair
 * @param fieldValue field value of the pair
 * @param hllp sketch to fold in; it is copied through its byte form, not modified
 * @throws Exception if the sketch cannot be copied or merged
 */
public void addPair(String fieldName, String fieldValue, HyperLogLogPlus hllp) throws Exception {
    if (fieldName.equals(tuple.getValue0())) {
        return;
    }

    final Pair<String,String> pairKey = new Pair<>(fieldName, fieldValue);
    final Long storedSum = intersectionSum.get(pairKey);
    final HyperLogLogPlus storedUnion = intersectionUnion.get(pairKey);
    final HyperLogLogPlus merged = HyperLogLogPlus.Builder.build(hllp.getBytes());

    final boolean firstSighting = (storedSum == null);
    final long startingSum = firstSighting ? baseHllp.cardinality() : storedSum;
    intersectionSum.put(pairKey, startingSum + hllp.cardinality());

    merged.addAll(firstSighting ? baseHllp : storedUnion);
    intersectionUnion.put(pairKey, merged);
}
Example #6
Source File: StringQuality.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Builds a StringQuality and sizes its HyperLogLogPlus sketch from the requested relative
 * standard deviation.
 *
 * <p>The sketch follows streamlib's implementation of "HyperLogLog in Practice: Algorithmic
 * Engineering of a State of The Art Cardinality Estimation Algorithm"
 * (<a href="http://dx.doi.org/10.1145/2452376.2452456">paper</a>), whose relative accuracy is
 * roughly {@code 1.054 / sqrt(2^p)}. Sparse precision is fixed at {@code 0}; choosing a
 * nonzero {@code sp > p} would switch on the sparse register representation, which may lower
 * memory consumption and raise accuracy for small cardinalities.
 *
 * @param relativeSD desired relative standard deviation, from which {@code p} is derived
 */
public StringQuality(long countValid, long countInvalid, long countMissing, long countTotal,
                long countEmptyString, long countAlphabetic, long countNumerical, long countWordCharacter,
                long countWhitespace, double relativeSD) {
    this(countValid, countInvalid, countMissing, countTotal, countEmptyString, countAlphabetic,
                    countNumerical, countWordCharacter, countWhitespace,
                    new HyperLogLogPlus(hllPrecision(relativeSD), 0));
}

/** Derives the sketch precision as {@code ceil(2 * log2(1.054 / relativeSD))}. */
private static int hllPrecision(double relativeSD) {
    return (int) Math.ceil(2.0 * Math.log(1.054 / relativeSD) / Math.log(2));
}
Example #7
Source File: ApproximateDistinctTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Checks that a sketch holding the integers 0 through 9 survives an encode/decode round trip
 * through {@code HyperLogLogPlusCoder}.
 */
@Test
public void testCoder() throws Exception {
    final HyperLogLogPlus sketch = new HyperLogLogPlus(12, 18);
    int value = 0;
    while (value < 10) {
        sketch.offer(value);
        value++;
    }
    CoderProperties.coderDecodeEncodeEqual(ApproximateDistinct.HyperLogLogPlusCoder.of(), sketch);
}
Example #8
Source File: StringQualityAddFunction.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Folds one writable into the running string-quality statistics.
 *
 * <p>The total count always grows by one. The writable is counted as missing when it is a
 * {@link NullWritable}, otherwise as valid or invalid according to {@code meta.isValid}. Its
 * string form is counted as empty when null or zero-length. Otherwise each character-class
 * counter is incremented when the whole string matches that class. The string is also offered
 * to the sketch taken from {@code v1}, and that same sketch instance is passed to the result.
 *
 * @param v1 statistics accumulated so far
 * @param writable value to add
 * @return a new StringQuality holding the updated counters
 */
@Override
public StringQuality apply(StringQuality v1, Writable writable) {
    long valid = v1.getCountValid();
    long invalid = v1.getCountInvalid();
    long countMissing = v1.getCountMissing();
    long countTotal = v1.getCountTotal() + 1;
    long empty = v1.getCountEmptyString();
    long alphabetic = v1.getCountAlphabetic();
    long numerical = v1.getCountNumerical();
    long word = v1.getCountWordCharacter();
    long whitespaceOnly = v1.getCountWhitespace();
    HyperLogLogPlus hll = v1.getHll();

    String str = writable.toString();

    if (writable instanceof NullWritable) {
        countMissing++;
    } else if (meta.isValid(writable)) {
        valid++;
    } else {
        invalid++;
    }

    if (str == null || str.isEmpty()) {
        empty++;
    } else {
        // String.matches tests the whole string, so the '+' quantifier is required for
        // strings longer than one character, as in the other three patterns.
        if (str.matches("[a-zA-Z]+")) {
            alphabetic++;
        }
        if (str.matches("\\d+")) {
            numerical++;
        }
        if (str.matches("\\w+")) {
            word++;
        }
        if (str.matches("\\s+")) {
            whitespaceOnly++;
        }
    }

    hll.offer(str);
    return new StringQuality(valid, invalid, countMissing, countTotal, empty, alphabetic, numerical, word,
                    whitespaceOnly, hll);
}
Example #9
Source File: ApproximateDistinct.java From beam with Apache License 2.0 | 5 votes |
/**
 * Encodes {@code record} with {@code inputCoder} and offers the resulting bytes to the
 * accumulator.
 *
 * @param acc accumulator sketch, updated in place
 * @param record element to add
 * @return the same accumulator instance
 * @throws IllegalStateException if the record cannot be encoded
 */
@Override
public HyperLogLogPlus addInput(HyperLogLogPlus acc, InputT record) {
    final byte[] encoded;
    try {
        encoded = CoderUtils.encodeToByteArray(inputCoder, record);
    } catch (CoderException e) {
        throw new IllegalStateException("The input value cannot be encoded: " + e.getMessage(), e);
    }
    acc.offer(encoded);
    return acc;
}
Example #10
Source File: ApproximateDistinct.java From beam with Apache License 2.0 | 5 votes |
@Override public HyperLogLogPlus mergeAccumulators(Iterable<HyperLogLogPlus> accumulators) { HyperLogLogPlus mergedAccum = createAccumulator(); for (HyperLogLogPlus accum : accumulators) { try { mergedAccum.addAll(accum); } catch (CardinalityMergeException e) { // Should never happen because only HyperLogLogPlus accumulators are instantiated. throw new IllegalStateException( "The accumulators cannot be merged: " + e.getMessage(), e); } } return mergedAccum; }
Example #11
Source File: StringQuality.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Creates a StringQuality from explicit counters and an existing sketch.
 *
 * <p>The first four counters go to the superclass constructor. The remaining counters and the
 * sketch reference are stored on this instance; the sketch is not copied.
 *
 * @param hll sketch to store as-is
 */
public StringQuality(long countValid, long countInvalid, long countMissing, long countTotal,
                long countEmptyString, long countAlphabetic, long countNumerical, long countWordCharacter,
                long countWhitespace, HyperLogLogPlus hll) {
    super(countValid, countInvalid, countMissing, countTotal);

    this.hll = hll;

    this.countWhitespace = countWhitespace;
    this.countWordCharacter = countWordCharacter;
    this.countNumerical = countNumerical;
    this.countAlphabetic = countAlphabetic;
    this.countEmptyString = countEmptyString;
}
Example #12
Source File: ApproximateDistinct.java From beam with Apache License 2.0 | 5 votes |
/**
 * Writes the sketch's byte form to the stream using {@code BYTE_ARRAY_CODER}.
 *
 * @param value sketch to encode; must not be null
 * @param outStream destination stream
 * @throws CoderException if {@code value} is null
 * @throws IOException if serializing or writing fails
 */
@Override
public void encode(HyperLogLogPlus value, OutputStream outStream) throws IOException {
    final boolean missing = (value == null);
    if (missing) {
        throw new CoderException("cannot encode a null HyperLogLogPlus sketch");
    }
    final byte[] serialized = value.getBytes();
    BYTE_ARRAY_CODER.encode(serialized, outStream);
}
Example #13
Source File: ApproximateDistinct.java From beam with Apache License 2.0 | 5 votes |
/**
 * Reports the size of a sketch as given by {@code value.sizeof()}.
 *
 * @param value sketch to measure; must not be null
 * @return the value of {@code value.sizeof()}
 * @throws CoderException if {@code value} is null
 */
@Override
protected long getEncodedElementByteSize(HyperLogLogPlus value) throws IOException {
    if (null == value) {
        throw new CoderException("cannot encode a null HyperLogLogPlus sketch");
    }
    final long byteSize = value.sizeof();
    return byteSize;
}
Example #14
Source File: ApproximateDistinct.java From beam with Apache License 2.0 | 5 votes |
/**
 * Creates a DoFn that replaces each keyed sketch with its cardinality, keeping the key.
 *
 * @param <K> key type
 * @return the DoFn
 */
private static <K> DoFn<KV<K, HyperLogLogPlus>, KV<K, Long>> perKey() {
    return new DoFn<KV<K, HyperLogLogPlus>, KV<K, Long>>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
            final KV<K, HyperLogLogPlus> keyedSketch = c.element();
            final K key = keyedSketch.getKey();
            final long distinct = keyedSketch.getValue().cardinality();
            c.output(KV.of(key, distinct));
        }
    };
}
Example #15
Source File: ApproximateDistinct.java From beam with Apache License 2.0 | 5 votes |
/**
 * Creates a DoFn that outputs the cardinality of each incoming sketch.
 *
 * @return the DoFn
 */
private static DoFn<HyperLogLogPlus, Long> globally() {
    return new DoFn<HyperLogLogPlus, Long>() {
        @ProcessElement
        public void apply(ProcessContext c) {
            final HyperLogLogPlus sketch = c.element();
            final long distinct = sketch.cardinality();
            c.output(distinct);
        }
    };
}
Example #16
Source File: FeatureHyperLogLogStatistics.java From geowave with Apache License 2.0 | 5 votes |
/**
 * Merges another statistic's sketch with this one and stores the result in {@code loglog}.
 * Arguments that are not {@link FeatureHyperLogLogStatistics} instances are ignored.
 *
 * @param mergeable statistic to merge in
 * @throws RuntimeException if the two sketches cannot be merged
 */
@Override
public void merge(final Mergeable mergeable) {
    if (!(mergeable instanceof FeatureHyperLogLogStatistics)) {
        return;
    }
    final FeatureHyperLogLogStatistics other = (FeatureHyperLogLogStatistics) mergeable;
    try {
        loglog = (HyperLogLogPlus) other.loglog.merge(loglog);
    } catch (final CardinalityMergeException e) {
        throw new RuntimeException("Unable to merge counters", e);
    }
}
Example #17
Source File: CoreOutputManager.java From ffwd with Apache License 2.0 | 5 votes |
/**
 * Resets the cardinality estimate by installing a fresh HLL++ once more than the configured
 * period (in ms) has elapsed since the last swap.
 *
 * <p>{@code hyperLogSwapLock} is used as a try-lock so that only one caller performs the swap.
 * The flag is taken with {@code compareAndSet}, which returns {@code true} on success.
 * {@code compareAndExchange(false, true)} returns the previous value, i.e. {@code false} on
 * success, so using it as the condition skipped the swap for the winner and left the flag set.
 * The flag is cleared in {@code finally} so a failure while swapping cannot leave it set.
 *
 * <p>NOTE(review): assumes {@code hyperLogSwapLock} is a
 * {@code java.util.concurrent.atomic.AtomicBoolean} - confirm against the field declaration.
 */
private void swapHyperLogLogPlus() {
    if (System.currentTimeMillis() - hyperLogSwapTS.get() > hyperLogLogPlusSwapPeriodMS
        && hyperLogSwapLock.compareAndSet(false, true)) {
        try {
            hyperLog.set(new HyperLogLogPlus(
                HYPER_LOG_LOG_PLUS_PRECISION_NORMAL, HYPER_LOG_LOG_PLUS_PRECISION_SPARSE));
            hyperLogSwapTS.set(System.currentTimeMillis());
        } finally {
            hyperLogSwapLock.set(false);
        }
    }
}
Example #18
Source File: StringQualityAddFunction.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Folds one writable into the running string-quality statistics.
 *
 * <p>The total count always grows by one. The writable is counted as missing when it is a
 * {@link NullWritable}, otherwise as valid or invalid according to {@code meta.isValid}. Its
 * string form is counted as empty when null or zero-length. Otherwise each character-class
 * counter is incremented when the whole string matches that class. The string is also offered
 * to the sketch taken from {@code v1}, and that same sketch instance is passed to the result.
 *
 * @param v1 statistics accumulated so far
 * @param writable value to add
 * @return a new StringQuality holding the updated counters
 * @throws Exception declared by the overridden method
 */
@Override
public StringQuality call(StringQuality v1, Writable writable) throws Exception {
    long valid = v1.getCountValid();
    long invalid = v1.getCountInvalid();
    long countMissing = v1.getCountMissing();
    long countTotal = v1.getCountTotal() + 1;
    long empty = v1.getCountEmptyString();
    long alphabetic = v1.getCountAlphabetic();
    long numerical = v1.getCountNumerical();
    long word = v1.getCountWordCharacter();
    long whitespaceOnly = v1.getCountWhitespace();
    HyperLogLogPlus hll = v1.getHll();

    String str = writable.toString();

    if (writable instanceof NullWritable) {
        countMissing++;
    } else if (meta.isValid(writable)) {
        valid++;
    } else {
        invalid++;
    }

    if (str == null || str.isEmpty()) {
        empty++;
    } else {
        // String.matches tests the whole string, so the '+' quantifier is required for
        // strings longer than one character, as in the other three patterns.
        if (str.matches("[a-zA-Z]+")) {
            alphabetic++;
        }
        if (str.matches("\\d+")) {
            numerical++;
        }
        if (str.matches("\\w+")) {
            word++;
        }
        if (str.matches("\\s+")) {
            whitespaceOnly++;
        }
    }

    hll.offer(str);
    return new StringQuality(valid, invalid, countMissing, countTotal, empty, alphabetic, numerical, word,
                    whitespaceOnly, hll);
}
Example #19
Source File: TopKSampler.java From stratio-cassandra with Apache License 2.0 | 5 votes |
/**
 * Starts recording samples. Has no effect when sampling is already enabled.
 *
 * @param capacity
 *            Number of sample items to keep in memory. Lower values give less accurate
 *            results. For best results use a value close to the cardinality, keeping the
 *            memory trade-off in mind.
 */
public synchronized void beginSampling(int capacity)
{
    if (enabled)
    {
        return;
    }
    summary = new StreamSummary<T>(capacity);
    hll = new HyperLogLogPlus(14);
    enabled = true;
}
Example #20
Source File: SSTableReader.java From stratio-cassandra with Apache License 2.0 | 5 votes |
private static ICardinality mergeCardinalities(Collection<ICardinality> cardinalities) { ICardinality base = new HyperLogLogPlus(13, 25); // see MetadataCollector.cardinality try { base = base.merge(cardinalities.toArray(new ICardinality[cardinalities.size()])); } catch (CardinalityMergeException e) { logger.warn("Could not merge cardinalities", e); } return base; }
Example #21
Source File: CompactionMetadata.java From stratio-cassandra with Apache License 2.0 | 5 votes |
/**
 * Reads a CompactionMetadata from the input: an int ancestor count, that many int ancestors,
 * then an int length followed by that many bytes of serialized HyperLogLogPlus.
 *
 * @param version descriptor version; not consulted by this method
 * @param in source of the serialized metadata
 * @return the deserialized metadata
 * @throws IOException if reading or building the estimator fails
 */
public CompactionMetadata deserialize(Descriptor.Version version, DataInput in) throws IOException
{
    final int ancestorCount = in.readInt();
    final Set<Integer> ancestors = new HashSet<>(ancestorCount);
    int remaining = ancestorCount;
    while (remaining > 0)
    {
        ancestors.add(in.readInt());
        remaining--;
    }

    final int sketchLength = in.readInt();
    final ICardinality cardinality = HyperLogLogPlus.Builder.build(ByteBufferUtil.readBytes(in, sketchLength));
    return new CompactionMetadata(ancestors, cardinality);
}
Example #22
Source File: FeatureHyperLogLogStatistics.java From geowave with Apache License 2.0 | 5 votes |
/**
 * Restores the sketch from its binary form: an unsigned varint length followed by that many
 * bytes of serialized HyperLogLogPlus, read from the buffer returned by the superclass.
 * If the sketch cannot be built, the error is logged and {@code loglog} is left unchanged.
 *
 * @param bytes serialized statistic
 */
@Override
public void fromBinary(final byte[] bytes) {
    final ByteBuffer source = super.binaryBuffer(bytes);
    final int sketchLength = VarintUtils.readUnsignedInt(source);
    final byte[] sketchBytes = ByteArrayUtils.safeRead(source, sketchLength);
    try {
        loglog = HyperLogLogPlus.Builder.build(sketchBytes);
    } catch (final IOException e) {
        LOGGER.error("Exception while reading statistic", e);
    }
}
Example #23
Source File: DistinctCountHyperLogLogAggregateFunction.java From phoenix with Apache License 2.0 | 5 votes |
/**
 * Deserializes the sketch referenced by {@code ptr} and merges it into {@code hll}.
 *
 * @param tuple current tuple; not consulted by this method
 * @param ptr pointer to the serialized HyperLogLogPlus bytes
 * @throws RuntimeException wrapping any exception raised while copying, building or merging
 */
@Override
public void aggregate(Tuple tuple, ImmutableBytesWritable ptr) {
    try {
        final byte[] serialized = ByteUtil.copyKeyBytesIfNecessary(ptr);
        final HyperLogLogPlus incoming = HyperLogLogPlus.Builder.build(serialized);
        hll.addAll(incoming);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Example #24
Source File: FeatureHyperLogLogStatistics.java From geowave with Apache License 2.0 | 5 votes |
/**
 * Creates the statistic with an empty sketch of the given precision.
 *
 * @param adapterId the adapter ID
 * @param fieldName the field name
 * @param precision precision passed to {@code HyperLogLogPlus(int)} and kept in
 *        {@code this.precision}. NOTE(review): the previous comment gave the range as
 *        1 to 32 - confirm against the HyperLogLogPlus constructor contract
 */
public FeatureHyperLogLogStatistics(
    final Short adapterId,
    final String fieldName,
    final int precision) {
    super(adapterId, STATS_TYPE, fieldName);
    this.precision = precision;
    this.loglog = new HyperLogLogPlus(precision);
}
Example #25
Source File: HyperLogFieldSummary.java From datawave with Apache License 2.0 | 5 votes |
@Override public void add(Value value) throws IOException { StatsHyperLogSummary stats = new StatsHyperLogSummary(value); this.count += stats.getCount(); HyperLogLogPlus hllpAdd = stats.getHyperLogPlus(); try { this.logPlus.addAll(hllpAdd); } catch (CardinalityMergeException e) { // addAll throws an out of scope exception throw new IOException(e); } }
Example #26
Source File: Cardinality.java From datawave with Apache License 2.0 | 5 votes |
/**
 * Reads this object's state: metadata first, then the lower and upper strings, then a
 * compressed byte array holding the serialized HyperLogLogPlus estimate.
 *
 * @param in source to read from
 * @throws IOException if reading or building the sketch fails
 */
@Override
public void readFields(DataInput in) throws IOException {
    readMetadata(in);

    this.content = new FieldValueCardinality();
    this.content.lower = WritableUtils.readString(in);
    this.content.upper = WritableUtils.readString(in);

    final byte[] serializedSketch = WritableUtils.readCompressedByteArray(in);
    this.content.estimate = HyperLogLogPlus.Builder.build(serializedSketch);
}
Example #27
Source File: Cardinality.java From datawave with Apache License 2.0 | 5 votes |
/**
 * Reads this object's state from a Kryo input: metadata first, then the lower and upper
 * strings, then an int length followed by that many bytes of serialized HyperLogLogPlus.
 *
 * <p>The sketch bytes are read with {@code Input.readBytes(int)}, which returns exactly
 * {@code size} bytes or throws. {@code Input.read(byte[])} may fill only part of the array,
 * and its return value was ignored, so a short read would have gone unnoticed.
 *
 * @param kryo Kryo instance, passed to the metadata reader
 * @param input source to read from
 * @throws RuntimeException wrapping any {@link IOException} raised while building the sketch
 */
@Override
public void read(Kryo kryo, Input input) {
    super.readMetadata(kryo, input);
    content = new FieldValueCardinality();
    this.content.lower = input.readString();
    this.content.upper = input.readString();
    int size = input.readInt();
    byte[] cardArray = input.readBytes(size);
    try {
        this.content.estimate = HyperLogLogPlus.Builder.build(cardArray);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Example #28
Source File: CardinalityScanner.java From datawave with Apache License 2.0 | 5 votes |
/**
 * Scans the configured table between the configured begin and end dates and aggregates the
 * entries into cardinality intersection records.
 *
 * <p>When {@code fields} is null or empty every entry is used; otherwise only entries whose
 * column family is in {@code fields}. Any exception raised while connecting or scanning is
 * logged and not rethrown, so the result then holds whatever was collected before the failure.
 *
 * @param fields column families to include, or null/empty for all
 * @param dateAggregationType passed through to {@code addEntry}
 * @param datatypeAggregationType passed through to {@code addEntry}
 * @return the key set of the collected records
 */
public Set<CardinalityIntersectionRecord> scanCardinalities(List<String> fields, DateAggregationType dateAggregationType,
                DatatypeAggregationType datatypeAggregationType) throws Exception {
    final Map<CardinalityIntersectionRecord,HyperLogLogPlus> collected = new TreeMap<>();
    Scanner scanner = null;
    try {
        final ZooKeeperInstance zkInstance = new ZooKeeperInstance(config.getInstanceName(), config.getZookeepers());
        final Connector conn = zkInstance.getConnector(config.getUsername(), new PasswordToken(config.getPassword()));
        final Collection<Authorizations> auths = Collections.singleton(new Authorizations(config.getAuths().split(",")));

        if (!conn.tableOperations().exists(config.getTableName())) {
            throw new IllegalArgumentException("Table " + config.getTableName() + " does not exist");
        }

        scanner = ScannerHelper.createScanner(conn, config.getTableName(), auths);
        scanner.setRange(new Range(config.getBeginDate(), config.getEndDate() + "\0"));

        final Iterator<Map.Entry<Key,Value>> entries = scanner.iterator();
        while (entries.hasNext()) {
            final Map.Entry<Key,Value> entry = entries.next();
            final String family = entry.getKey().getColumnFamily().toString();
            // no filter given, or the family is one of the requested fields
            if (fields == null || fields.isEmpty() || fields.contains(family)) {
                addEntry(collected, entry, dateAggregationType, datatypeAggregationType);
            }
        }
    } catch (Exception e) {
        log.error(e);
    } finally {
        if (scanner != null) {
            scanner.close();
        }
    }
    return collected.keySet();
}
Example #29
Source File: StringQuality.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Builds a StringQuality from explicit counters and a supplied sketch.
 *
 * <p>Valid, invalid, missing and total counts are handed to the superclass constructor. The
 * string-specific counters and the sketch reference are kept on this instance; no copy of the
 * sketch is made.
 *
 * @param hll sketch stored by reference
 */
public StringQuality(long countValid, long countInvalid, long countMissing, long countTotal,
                long countEmptyString, long countAlphabetic, long countNumerical, long countWordCharacter,
                long countWhitespace, HyperLogLogPlus hll) {
    super(countValid, countInvalid, countMissing, countTotal);

    // string-specific counters
    this.countEmptyString = countEmptyString;
    this.countWhitespace = countWhitespace;
    this.countAlphabetic = countAlphabetic;
    this.countNumerical = countNumerical;
    this.countWordCharacter = countWordCharacter;

    this.hll = hll;
}
Example #30
Source File: StatsHyperLogReducer.java From datawave with Apache License 2.0 | 5 votes |
@Override public void doReduce(BulkIngestKey key, Iterable<Value> values, TaskInputOutputContext<?,?,BulkIngestKey,Value> context) throws IOException, InterruptedException { log.info("reduce key(" + key.getKey() + ")"); this.totalKeys++; HyperLogLogPlus hllp = new HyperLogLogPlus(this.normalPrecision, this.sparsePrecision); HyperLogFieldSummary stats = new HyperLogFieldSummary(hllp); int valueCount = 0; for (Value val : values) { stats.add(val); valueCount++; if (0 == (valueCount % this.valueInterval) || this.countsOnly) { if (this.countsOnly) { StatsHyperLogSummary addStats = new StatsHyperLogSummary(val); log.info("add values(" + addStats.statsString() + ")"); } log.info("value count(" + valueCount + ")"); } } log.info("final stats data(" + stats.toString() + ")"); if (!this.countsOnly) { if (this.minCount <= stats.getCount()) { // write to bulk output StatsCounters counters = stats.toStatsCounters(); // set timestamp Key k = key.getKey(); k.setTimestamp(this.timestamp); writeBulkIngestKey(key, counters.getValue(), context); } else { log.debug("count is less than minimum: " + key.getKey().toString() + ") count(" + stats.getCount() + ")"); } } context.progress(); }