Java Code Examples for org.apache.kylin.dict.DictionaryGenerator#newDictionaryBuilder()
The following examples show how to use
org.apache.kylin.dict.DictionaryGenerator#newDictionaryBuilder().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: UHCDictionaryReducer.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
@Override protected void doSetup(Context context) throws IOException { super.bindCurrentConfiguration(context.getConfiguration()); Configuration conf = context.getConfiguration(); mos = new MultipleOutputs(context); KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata(); String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME); CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName); CubeDesc cubeDesc = cube.getDescriptor(); List<TblColRef> uhcColumns = cubeDesc.getAllUHCColumns(); int taskId = context.getTaskAttemptID().getTaskID().getId(); col = uhcColumns.get(taskId); logger.info("column name: " + col.getIdentity()); if (cube.getDescriptor().getShardByColumns().contains(col)) { //for ShardByColumns builder = DictionaryGenerator.newDictionaryBuilder(col.getType()); builder.init(null, 0, null); } else { //for GlobalDictionaryColumns String hdfsDir = conf.get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR); DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype()); String builderClass = cubeDesc.getDictionaryBuilderClass(col); builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass); builder.init(dictionaryInfo, 0, hdfsDir); } }
Example 2
Source File: SparkUHCDictionary.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
@Override public Tuple2<String, Tuple3<Writable, Writable, String>> call(Tuple2<Integer, List<String>> columnValues) throws Exception { if (initialized == false) { synchronized (SparkFactDistinct.class) { if (initialized == false) { init(); } } } try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig.setAndUnsetThreadLocalConfig(config); ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream outputStream = new DataOutputStream(baos)) { TblColRef col = uhcColumns.get(columnValues._1); logger.info("Processing column " + col.getName()); if (cube.getDescriptor().getShardByColumns().contains(col)) { //for ShardByColumns builder = DictionaryGenerator.newDictionaryBuilder(col.getType()); builder.init(null, 0, null); } else { //for GlobalDictionaryColumns DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype()); String builderClass = cubeDesc.getDictionaryBuilderClass(col); builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass); builder.init(dictionaryInfo, 0, hdfsDir); } Iterator<String> values = columnValues._2.iterator(); while (values.hasNext()) { builder.addValue(values.next()); } Dictionary<String> dict = builder.build(); String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX; logger.info("Dictionary file name is " + dictFileName); outputStream.writeUTF(dict.getClass().getName()); dict.write(outputStream); Tuple3 tuple3 = new Tuple3(NullWritable.get(), new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName); return new Tuple2<>(BatchConstants.CFG_OUTPUT_DICT, tuple3); } }
Example 3
Source File: UHCDictionaryReducer.java From kylin with Apache License 2.0 | 5 votes |
@Override protected void doSetup(Context context) throws IOException { super.bindCurrentConfiguration(context.getConfiguration()); Configuration conf = context.getConfiguration(); mos = new MultipleOutputs(context); KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata(); String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME); CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName); CubeDesc cubeDesc = cube.getDescriptor(); List<TblColRef> uhcColumns = cubeDesc.getAllUHCColumns(); int taskId = context.getTaskAttemptID().getTaskID().getId(); col = uhcColumns.get(taskId); logger.info("column name: " + col.getIdentity()); if (cube.getDescriptor().getShardByColumns().contains(col)) { //for ShardByColumns builder = DictionaryGenerator.newDictionaryBuilder(col.getType()); builder.init(null, 0, null); } else { //for GlobalDictionaryColumns String hdfsDir = conf.get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR); DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype()); String builderClass = cubeDesc.getDictionaryBuilderClass(col); builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass); builder.init(dictionaryInfo, 0, hdfsDir); } }
Example 4
Source File: FactDistinctColumnsBase.java From kylin with Apache License 2.0 | 5 votes |
public void setupReduce(int taskId) throws IOException { this.taskId = taskId; try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig .setAndUnsetThreadLocalConfig(envConfig)) { cube = CubeManager.getInstance(envConfig).getCube(cubeName); cubeDesc = cube.getDescriptor(); reducerMapping = new FactDistinctColumnsReducerMapping(cube); logger.info("reducer no " + taskId + ", role play " + reducerMapping.getRolePlayOfReducer(taskId)); if (reducerMapping.isCuboidRowCounterReducer(taskId)) { // hll isStatistics = true; baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId(); baseCuboidRowCountInMappers = Lists.newArrayList(); cuboidHLLMap = Maps.newHashMap(); logger.info("Reducer " + taskId + " handling stats"); } else { // normal col col = reducerMapping.getColForReducer(taskId); Preconditions.checkNotNull(col); // local build dict buildDictInReducer = envConfig.isBuildDictInReducerEnabled(); if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder buildDictInReducer = false; } if (reducerMapping.getReducerNumForDimCol(col) > 1) { buildDictInReducer = false; // only works if this is the only reducer of a dictionary column } if (buildDictInReducer) { builder = DictionaryGenerator.newDictionaryBuilder(col.getType()); builder.init(null, 0, null); } logger.info("Reducer " + taskId + " handling column " + col + ", buildDictInReducer=" + buildDictInReducer); } } }
Example 5
Source File: SparkUHCDictionary.java From kylin with Apache License 2.0 | 5 votes |
@Override public Tuple2<String, Tuple3<Writable, Writable, String>> call(Tuple2<Integer, List<String>> columnValues) throws Exception { if (initialized == false) { synchronized (SparkFactDistinct.class) { if (initialized == false) { init(); } } } try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig.setAndUnsetThreadLocalConfig(config); ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream outputStream = new DataOutputStream(baos)) { TblColRef col = uhcColumns.get(columnValues._1); logger.info("Processing column " + col.getName()); if (cube.getDescriptor().getShardByColumns().contains(col)) { //for ShardByColumns builder = DictionaryGenerator.newDictionaryBuilder(col.getType()); builder.init(null, 0, null); } else { //for GlobalDictionaryColumns DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype()); String builderClass = cubeDesc.getDictionaryBuilderClass(col); builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass); builder.init(dictionaryInfo, 0, hdfsDir); } Iterator<String> values = columnValues._2.iterator(); while (values.hasNext()) { builder.addValue(values.next()); } Dictionary<String> dict = builder.build(); String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX; logger.info("Dictionary file name is " + dictFileName); outputStream.writeUTF(dict.getClass().getName()); dict.write(outputStream); Tuple3 tuple3 = new Tuple3(NullWritable.get(), new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName); return new Tuple2<>(BatchConstants.CFG_OUTPUT_DICT, tuple3); } }
Example 6
Source File: FactDistinctColumnsReducer.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override protected void doSetup(Context context) throws IOException { super.bindCurrentConfiguration(context.getConfiguration()); Configuration conf = context.getConfiguration(); mos = new MultipleOutputs(context); KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata(); String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME); CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName); cubeConfig = cube.getConfig(); cubeDesc = cube.getDescriptor(); taskId = context.getTaskAttemptID().getTaskID().getId(); reducerMapping = new FactDistinctColumnsReducerMapping(cube); logger.info("reducer no " + taskId + ", role play " + reducerMapping.getRolePlayOfReducer(taskId)); if (reducerMapping.isCuboidRowCounterReducer(taskId)) { // hll isStatistics = true; baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId(); baseCuboidRowCountInMappers = Lists.newArrayList(); cuboidHLLMap = Maps.newHashMap(); samplingPercentage = Integer .parseInt(context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT)); logger.info("Reducer " + taskId + " handling stats"); } else { // normal col col = reducerMapping.getColForReducer(taskId); Preconditions.checkNotNull(col); // local build dict buildDictInReducer = config.isBuildDictInReducerEnabled(); if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder buildDictInReducer = false; } if (reducerMapping.getReducerNumForDimCol(col) > 1) { buildDictInReducer = false; // only works if this is the only reducer of a dictionary column } if (buildDictInReducer) { builder = DictionaryGenerator.newDictionaryBuilder(col.getType()); builder.init(null, 0, null); } logger.info("Reducer " + taskId + " handling column " + col + ", buildDictInReducer=" + buildDictInReducer); } }
Example 7
Source File: SparkFactDistinct.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
private void init() throws IOException { taskId = TaskContext.getPartitionId(); kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl); try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig .setAndUnsetThreadLocalConfig(kConfig)) { CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName); cubeDesc = cubeInstance.getDescriptor(); cubeConfig = cubeInstance.getConfig(); reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance); result = Lists.newArrayList(); if (reducerMapping.isCuboidRowCounterReducer(taskId)) { // hll isStatistics = true; baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId(); baseCuboidRowCountInMappers = Lists.newArrayList(); cuboidHLLMap = Maps.newHashMap(); logger.info("Partition {} handling stats", taskId); } else { // normal col col = reducerMapping.getColForReducer(taskId); Preconditions.checkNotNull(col); isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col) && col.getType().needCompare(); isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col); // local build dict buildDictInReducer = kConfig.isBuildDictInReducerEnabled(); if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder buildDictInReducer = false; } if (reducerMapping.getReducerNumForDimCol(col) > 1) { buildDictInReducer = false; // only works if this is the only reducer of a dictionary column } if (buildDictInReducer) { builder = DictionaryGenerator.newDictionaryBuilder(col.getType()); builder.init(null, 0, null); } logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer); } initialized = true; } }
Example 8
Source File: FactDistinctColumnsReducer.java From kylin with Apache License 2.0 | 4 votes |
@Override protected void doSetup(Context context) throws IOException { super.bindCurrentConfiguration(context.getConfiguration()); Configuration conf = context.getConfiguration(); mos = new MultipleOutputs(context); KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata(); String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME); CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName); cubeConfig = cube.getConfig(); cubeDesc = cube.getDescriptor(); taskId = context.getTaskAttemptID().getTaskID().getId(); reducerMapping = new FactDistinctColumnsReducerMapping(cube); logger.info("reducer no " + taskId + ", role play " + reducerMapping.getRolePlayOfReducer(taskId)); if (reducerMapping.isCuboidRowCounterReducer(taskId)) { // hll isStatistics = true; baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId(); baseCuboidRowCountInMappers = Lists.newArrayList(); cuboidHLLMap = Maps.newHashMap(); samplingPercentage = Integer .parseInt(context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT)); logger.info("Reducer " + taskId + " handling stats"); } else { // normal col col = reducerMapping.getColForReducer(taskId); Preconditions.checkNotNull(col); // local build dict buildDictInReducer = config.isBuildDictInReducerEnabled(); if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder buildDictInReducer = false; } if (reducerMapping.getReducerNumForDimCol(col) > 1) { buildDictInReducer = false; // only works if this is the only reducer of a dictionary column } if (buildDictInReducer) { builder = DictionaryGenerator.newDictionaryBuilder(col.getType()); builder.init(null, 0, null); } logger.info("Reducer " + taskId + " handling column " + col + ", buildDictInReducer=" + buildDictInReducer); } }
Example 9
Source File: SparkFactDistinct.java From kylin with Apache License 2.0 | 4 votes |
private void init() throws IOException { taskId = TaskContext.getPartitionId(); kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl); try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig .setAndUnsetThreadLocalConfig(kConfig)) { CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName); cubeDesc = cubeInstance.getDescriptor(); cubeConfig = cubeInstance.getConfig(); reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance); result = Lists.newArrayList(); if (reducerMapping.isCuboidRowCounterReducer(taskId)) { // hll isStatistics = true; baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId(); baseCuboidRowCountInMappers = Lists.newArrayList(); cuboidHLLMap = Maps.newHashMap(); logger.info("Partition {} handling stats", taskId); } else { // normal col col = reducerMapping.getColForReducer(taskId); Preconditions.checkNotNull(col); isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col) && col.getType().needCompare(); isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col); // local build dict buildDictInReducer = kConfig.isBuildDictInReducerEnabled(); if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder buildDictInReducer = false; } if (reducerMapping.getReducerNumForDimCol(col) > 1) { buildDictInReducer = false; // only works if this is the only reducer of a dictionary column } if (buildDictInReducer) { builder = DictionaryGenerator.newDictionaryBuilder(col.getType()); builder.init(null, 0, null); } logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer); } initialized = true; } }