org.apache.kylin.dict.DictionaryGenerator#buildDictionary

Source File: FragmentFilesMerger.java From kylin-on-parquet-v2 with Apache License 2.0

6 votes

private Map<TblColRef, Dictionary<String>> mergeAndPersistDictionaries(FragmentMetaInfo fragmentMetaInfo,
        Map<TblColRef, List<Dictionary<String>>> dimDictListMap, CountingOutputStream fragmentOut)
        throws IOException {
    logger.info("merge dimension dictionaries");
    Map<TblColRef, Dictionary<String>> mergedDictMap = Maps.newHashMap();
    List<DimDictionaryMetaInfo> dimDictionaryMetaInfos = Lists.newArrayList();
    for (TblColRef dimension : parsedCubeInfo.dimensionsUseDictEncoding) {
        List<Dictionary<String>> dicts = dimDictListMap.get(dimension);
        MultipleDictionaryValueEnumerator multipleDictionaryValueEnumerator = new MultipleDictionaryValueEnumerator(
                dimension.getType(), dicts);
        Dictionary<String> mergedDict = DictionaryGenerator.buildDictionary(dimension.getType(),
                multipleDictionaryValueEnumerator);
        mergedDictMap.put(dimension, mergedDict);

        DimDictionaryMetaInfo dimDictionaryMetaInfo = new DimDictionaryMetaInfo();
        dimDictionaryMetaInfo.setDimName(dimension.getName());
        dimDictionaryMetaInfo.setDictType(mergedDict.getClass().getName());
        dimDictionaryMetaInfo.setStartOffset((int) fragmentOut.getCount());

        DictionarySerializer.serialize(mergedDict, fragmentOut);
        dimDictionaryMetaInfo.setDictLength((int) fragmentOut.getCount() - dimDictionaryMetaInfo.getStartOffset());
        dimDictionaryMetaInfos.add(dimDictionaryMetaInfo);
    }
    fragmentMetaInfo.setDimDictionaryMetaInfos(dimDictionaryMetaInfos);
    return mergedDictMap;
}

Source File: ColumnarMemoryStorePersister.java From kylin-on-parquet-v2 with Apache License 2.0

6 votes

private Dictionary<String> buildDictionary(TblColRef dim, List<Object> inputValues) throws IOException {
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.start();
    final Collection<String> values = Collections2.transform(Sets.newHashSet(inputValues),
            new Function<Object, String>() {
                @Nullable
                @Override
                public String apply(Object input) {
                    String value = (String) input;
                    return value;
                }
            });
    final Dictionary<String> dict = DictionaryGenerator.buildDictionary(dim.getType(),
            new IterableDictionaryValueEnumerator(values));
    stopwatch.stop();
    if (logger.isDebugEnabled()) {
        logger.debug("BuildDictionary for column : " + dim.getName() + " took : " + stopwatch.elapsedMillis()
                + " ms ");
    }
    return dict;
}

Source File: FragmentFilesMerger.java From kylin with Apache License 2.0

6 votes

private Map<TblColRef, Dictionary<String>> mergeAndPersistDictionaries(FragmentMetaInfo fragmentMetaInfo,
        Map<TblColRef, List<Dictionary<String>>> dimDictListMap, CountingOutputStream fragmentOut)
        throws IOException {
    logger.info("merge dimension dictionaries");
    Map<TblColRef, Dictionary<String>> mergedDictMap = Maps.newHashMap();
    List<DimDictionaryMetaInfo> dimDictionaryMetaInfos = Lists.newArrayList();
    for (TblColRef dimension : parsedCubeInfo.dimensionsUseDictEncoding) {
        List<Dictionary<String>> dicts = dimDictListMap.get(dimension);
        MultipleDictionaryValueEnumerator multipleDictionaryValueEnumerator = new MultipleDictionaryValueEnumerator(
                dimension.getType(), dicts);
        Dictionary<String> mergedDict = DictionaryGenerator.buildDictionary(dimension.getType(),
                multipleDictionaryValueEnumerator);
        mergedDictMap.put(dimension, mergedDict);

        DimDictionaryMetaInfo dimDictionaryMetaInfo = new DimDictionaryMetaInfo();
        dimDictionaryMetaInfo.setDimName(dimension.getName());
        dimDictionaryMetaInfo.setDictType(mergedDict.getClass().getName());
        dimDictionaryMetaInfo.setStartOffset((int) fragmentOut.getCount());

        DictionarySerializer.serialize(mergedDict, fragmentOut);
        dimDictionaryMetaInfo.setDictLength((int) fragmentOut.getCount() - dimDictionaryMetaInfo.getStartOffset());
        dimDictionaryMetaInfos.add(dimDictionaryMetaInfo);
    }
    fragmentMetaInfo.setDimDictionaryMetaInfos(dimDictionaryMetaInfos);
    return mergedDictMap;
}

Source File: ColumnarMemoryStorePersister.java From kylin with Apache License 2.0

6 votes

private Dictionary<String> buildDictionary(TblColRef dim, List<Object> inputValues) throws IOException {
    Stopwatch stopwatch = Stopwatch.createUnstarted();
    stopwatch.start();
    final Collection<String> values = Collections2.transform(Sets.newHashSet(inputValues),
            new Function<Object, String>() {
                @Nullable
                @Override
                public String apply(Object input) {
                    String value = (String) input;
                    return value;
                }
            });
    final Dictionary<String> dict = DictionaryGenerator.buildDictionary(dim.getType(),
            new IterableDictionaryValueEnumerator(values));
    stopwatch.stop();
    if (logger.isDebugEnabled()) {
        logger.debug("BuildDictionary for column : " + dim.getName() + " took : " + stopwatch.elapsed(MILLISECONDS)
                + " ms ");
    }
    return dict;
}

Source File: CubingUtils.java From kylin-on-parquet-v2 with Apache License 2.0

5 votes

public static Map<TblColRef, Dictionary<String>> buildDictionary(final CubeInstance cubeInstance,
        Iterable<List<String>> recordList) throws IOException {
    final List<TblColRef> columnsNeedToBuildDictionary = cubeInstance.getDescriptor()
            .listDimensionColumnsExcludingDerived(true);
    final HashMap<Integer, TblColRef> tblColRefMap = Maps.newHashMap();
    int index = 0;
    for (TblColRef column : columnsNeedToBuildDictionary) {
        tblColRefMap.put(index++, column);
    }

    HashMap<TblColRef, Dictionary<String>> result = Maps.newHashMap();

    HashMultimap<TblColRef, String> valueMap = HashMultimap.create();
    for (List<String> row : recordList) {
        for (int i = 0; i < row.size(); i++) {
            String cell = row.get(i);
            if (tblColRefMap.containsKey(i)) {
                valueMap.put(tblColRefMap.get(i), cell);
            }
        }
    }
    for (TblColRef tblColRef : valueMap.keySet()) {
        Set<String> values = valueMap.get(tblColRef);
        Dictionary<String> dict = DictionaryGenerator.buildDictionary(tblColRef.getType(),
                new IterableDictionaryValueEnumerator(values));
        result.put(tblColRef, dict);
    }
    return result;
}

Source File: CubingUtils.java From kylin with Apache License 2.0

5 votes

public static Map<TblColRef, Dictionary<String>> buildDictionary(final CubeInstance cubeInstance,
        Iterable<List<String>> recordList) throws IOException {
    final List<TblColRef> columnsNeedToBuildDictionary = cubeInstance.getDescriptor()
            .listDimensionColumnsExcludingDerived(true);
    final HashMap<Integer, TblColRef> tblColRefMap = Maps.newHashMap();
    int index = 0;
    for (TblColRef column : columnsNeedToBuildDictionary) {
        tblColRefMap.put(index++, column);
    }

    HashMap<TblColRef, Dictionary<String>> result = Maps.newHashMap();

    HashMultimap<TblColRef, String> valueMap = HashMultimap.create();
    for (List<String> row : recordList) {
        for (int i = 0; i < row.size(); i++) {
            String cell = row.get(i);
            if (tblColRefMap.containsKey(i)) {
                valueMap.put(tblColRefMap.get(i), cell);
            }
        }
    }
    for (TblColRef tblColRef : valueMap.keySet()) {
        Set<String> values = valueMap.get(tblColRef);
        Dictionary<String> dict = DictionaryGenerator.buildDictionary(tblColRef.getType(),
                new IterableDictionaryValueEnumerator(values));
        result.put(tblColRef, dict);
    }
    return result;
}

Source File: MergeDictReducer.java From kylin-on-parquet-v2 with Apache License 2.0

4 votes

@Override
protected void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    String col = key.toString();
    logger.info("merge dictionary for column:{}", col);
    TblColRef tblColRef = colNeedDictMap.get(col);

    if (tblColRef == null) {
        logger.warn("column:{} not found in the columns need dictionary map: {}", col, colNeedDictMap.keySet());
        return;
    }

    DataType dataType = tblColRef.getType();
    List<Dictionary<String>> dicts = Lists.newLinkedList();
    for (Text value : values) {
        ByteArray byteArray = new ByteArray(value.getBytes());
        Dictionary<String> dict = (Dictionary<String>) DictionarySerializer.deserialize(byteArray);
        dicts.add(dict);
    }
    Dictionary mergedDict;
    if (dicts.size() > 1) {
        MultipleDictionaryValueEnumerator multipleDictionaryValueEnumerator = new MultipleDictionaryValueEnumerator(
                dataType, dicts);
        mergedDict = DictionaryGenerator.buildDictionary(dataType, multipleDictionaryValueEnumerator);
    } else if (dicts.size() == 1) {
        mergedDict = dicts.get(0);
    } else {
        throw new IllegalArgumentException("Dictionary missing for column " + col);
    }
    if (mergedDict == null) {
        throw new IllegalArgumentException("Merge dictionaries error for column " + col);
    }

    TableDesc tableDesc = tblColRef.getColumnDesc().getTable();
    IReadableTable.TableSignature signature = new IReadableTable.TableSignature();
    signature.setLastModifiedTime(System.currentTimeMillis());
    signature.setPath(tableDesc.getResourcePath());

    //TODO: Table signature size?
    //        signature.setSize(mergedDict.getSize());

    DictionaryInfo dictionaryInfo = new DictionaryInfo(tblColRef.getTable(), tblColRef.getName(), tblColRef
            .getColumnDesc().getZeroBasedIndex(), tblColRef.getDatatype(), signature);
    dictionaryInfo.setDictionaryObject(mergedDict);
    dictionaryInfo.setDictionaryClass(mergedDict.getClass().getName());
    dictionaryInfo.setCardinality(mergedDict.getSize());

    ByteArrayOutputStream fulBuf = new ByteArrayOutputStream();
    DataOutputStream fulDout = new DataOutputStream(fulBuf);
    DictionaryInfoSerializer.FULL_SERIALIZER.serialize(dictionaryInfo, fulDout);

    Text outValue = new Text(fulBuf.toByteArray());
    context.write(key, outValue);
    logger.debug("output dict info of column {} to path: {}", col,
            context.getConfiguration().get(FileOutputFormat.OUTDIR));
}

Source File: MergeDictReducer.java From kylin with Apache License 2.0

4 votes

@Override
protected void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    String col = key.toString();
    logger.info("merge dictionary for column:{}", col);
    TblColRef tblColRef = colNeedDictMap.get(col);

    if (tblColRef == null) {
        logger.warn("column:{} not found in the columns need dictionary map: {}", col, colNeedDictMap.keySet());
        return;
    }

    DataType dataType = tblColRef.getType();
    List<Dictionary<String>> dicts = Lists.newLinkedList();
    for (Text value : values) {
        ByteArray byteArray = new ByteArray(value.getBytes());
        Dictionary<String> dict = (Dictionary<String>) DictionarySerializer.deserialize(byteArray);
        dicts.add(dict);
    }
    Dictionary mergedDict;
    if (dicts.size() > 1) {
        MultipleDictionaryValueEnumerator multipleDictionaryValueEnumerator = new MultipleDictionaryValueEnumerator(
                dataType, dicts);
        mergedDict = DictionaryGenerator.buildDictionary(dataType, multipleDictionaryValueEnumerator);
    } else if (dicts.size() == 1) {
        mergedDict = dicts.get(0);
    } else {
        throw new IllegalArgumentException("Dictionary missing for column " + col);
    }
    if (mergedDict == null) {
        throw new IllegalArgumentException("Merge dictionaries error for column " + col);
    }

    TableDesc tableDesc = tblColRef.getColumnDesc().getTable();
    IReadableTable.TableSignature signature = new IReadableTable.TableSignature();
    signature.setLastModifiedTime(System.currentTimeMillis());
    signature.setPath(tableDesc.getResourcePath());

    //TODO: Table signature size?
    //        signature.setSize(mergedDict.getSize());

    DictionaryInfo dictionaryInfo = new DictionaryInfo(tblColRef.getTable(), tblColRef.getName(), tblColRef
            .getColumnDesc().getZeroBasedIndex(), tblColRef.getDatatype(), signature);
    dictionaryInfo.setDictionaryObject(mergedDict);
    dictionaryInfo.setDictionaryClass(mergedDict.getClass().getName());
    dictionaryInfo.setCardinality(mergedDict.getSize());

    ByteArrayOutputStream fulBuf = new ByteArrayOutputStream();
    DataOutputStream fulDout = new DataOutputStream(fulBuf);
    DictionaryInfoSerializer.FULL_SERIALIZER.serialize(dictionaryInfo, fulDout);

    Text outValue = new Text(fulBuf.toByteArray());
    context.write(key, outValue);
    logger.debug("output dict info of column {} to path: {}", col,
            context.getConfiguration().get(FileOutputFormat.OUTDIR));
}

Java Code Examples for org.apache.kylin.dict.DictionaryGenerator#buildDictionary()