Java Code Examples for org.apache.kylin.common.KylinConfig#getMrHiveDictColumnsExcludeRefColumns()
The following examples show how to use org.apache.kylin.common.KylinConfig#getMrHiveDictColumnsExcludeRefColumns().
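Before looking at the examples, a minimal sketch of how the method is typically consumed may help: as the name suggests, it returns the Hive global-dictionary columns with any "ref" columns (columns that reuse another column's dictionary) filtered out. The demo class below is illustrative only; it assumes a running Kylin environment in which KylinConfig.getInstanceFromEnv() can load kylin.properties, and it mirrors the null/empty guard used throughout the examples.

import java.util.Objects;

import org.apache.kylin.common.KylinConfig;

public class MrHiveDictColumnsDemo {

    public static void main(String[] args) {
        // Obtain the KylinConfig the same way several of the examples below do
        KylinConfig config = KylinConfig.getInstanceFromEnv();

        // Columns that need a Hive global dictionary, excluding "ref" columns
        String[] dictCols = config.getMrHiveDictColumnsExcludeRefColumns();

        // The examples below consistently guard against a null or empty result
        if (Objects.nonNull(dictCols) && dictCols.length > 0 && !"".equals(dictCols[0])) {
            for (String col : dictCols) {
                System.out.println("Hive global dictionary column: " + col);
            }
        }
    }
}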
Example 1
Source File: HiveInputBase.java From kylin with Apache License 2.0
@Override
public void addStepPhase_ReplaceFlatTableGlobalColumnValue(DefaultChainedExecutable jobFlow) {
    KylinConfig dictConfig = flatDesc.getSegment().getConfig();
    final String cubeName = CubingExecutableUtil.getCubeName(jobFlow.getParams());
    String globalDictTable = MRHiveDictUtil.globalDictTableName(flatDesc, cubeName);
    String globalDictDatabase = dictConfig.getMrHiveDictDB();

    String[] mrHiveDictColumnsExcludeRefCols = dictConfig.getMrHiveDictColumnsExcludeRefColumns();
    Map<String, String> dictRef = dictConfig.getMrHiveDictRefColumns();
    final String hiveInitStatements = JoinedFlatTable.generateHiveInitStatements(flatTableDatabase);

    if (Objects.nonNull(mrHiveDictColumnsExcludeRefCols) && mrHiveDictColumnsExcludeRefCols.length > 0) {
        jobFlow.addTask(createHiveGlobalDictMergeGlobalDict(flatDesc, hiveInitStatements, cubeName,
                mrHiveDictColumnsExcludeRefCols, globalDictDatabase, globalDictTable));
        for (String item : mrHiveDictColumnsExcludeRefCols) {
            dictRef.put(item, "");
        }
    }

    // replace step
    if (!dictRef.isEmpty()) {
        jobFlow.addTask(createMrHiveGlobalDictReplaceStep(flatDesc, hiveInitStatements, cubeName,
                dictRef, flatTableDatabase, globalDictDatabase, globalDictTable,
                dictConfig.getMrHiveDictTableSuffix(), jobFlow.getId()));
    }
}
Example 2
Source File: BatchCubingJobBuilder2.java From kylin with Apache License 2.0
/**
 * Build hive global dictionary by MR and encode corresponding column into integer for flat table
 */
protected void buildHiveGlobalDictionaryByMR(final CubingJob result, String jobId) {
    KylinConfig dictConfig = seg.getConfig();
    String[] mrHiveDictColumnExcludeRef = dictConfig.getMrHiveDictColumnsExcludeRefColumns();
    String[] mrHiveDictColumns = dictConfig.getMrHiveDictColumns();

    if (Objects.nonNull(mrHiveDictColumnExcludeRef) && mrHiveDictColumnExcludeRef.length > 0
            && !"".equals(mrHiveDictColumnExcludeRef[0])) {
        // 1. parallel part build
        result.addTask(createBuildGlobalHiveDictPartBuildJob(jobId));
        // 2. parallel total build
        result.addTask(createBuildGlobalHiveDictTotalBuildJob(jobId));
    }

    // Merge new dictionary entry into global dictionary and replace/encode flat table
    if (Objects.nonNull(mrHiveDictColumns) && mrHiveDictColumns.length > 0
            && !"".equals(mrHiveDictColumns[0])) {
        inputSide.addStepPhase_ReplaceFlatTableGlobalColumnValue(result);
    }
}
Example 3
Source File: SparkBatchCubingJobBuilder2.java From kylin with Apache License 2.0
/**
 * Build hive global dictionary by MR and encode corresponding column into integer for flat table
 */
protected void buildHiveGlobalDictionaryByMR(final CubingJob result, String jobId) {
    KylinConfig dictConfig = seg.getConfig();
    String[] mrHiveDictColumnExcludeRef = dictConfig.getMrHiveDictColumnsExcludeRefColumns();
    String[] mrHiveDictColumns = dictConfig.getMrHiveDictColumns();

    if (Objects.nonNull(mrHiveDictColumnExcludeRef) && mrHiveDictColumnExcludeRef.length > 0
            && !"".equals(mrHiveDictColumnExcludeRef[0])) {
        // 1. parallel part build
        result.addTask(createBuildGlobalHiveDictPartBuildJob(jobId));
        // 2. parallel total build
        result.addTask(createBuildGlobalHiveDictTotalBuildJob(jobId));
    }

    // merge global dic and replace flat table
    if (Objects.nonNull(mrHiveDictColumns) && mrHiveDictColumns.length > 0
            && !"".equals(mrHiveDictColumns[0])) {
        inputSide.addStepPhase_ReplaceFlatTableGlobalColumnValue(result);
    }
}
Example 4
Source File: HiveInputBase.java From kylin with Apache License 2.0
@Override
public void addStepPhase1_CreateFlatTable(DefaultChainedExecutable jobFlow) {
    final String cubeName = CubingExecutableUtil.getCubeName(jobFlow.getParams());
    CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv()).getCube(cubeName);
    final KylinConfig cubeConfig = cubeInstance.getConfig();
    final String hiveInitStatements = JoinedFlatTable.generateHiveInitStatements(flatTableDatabase);

    // create flat table first
    addStepPhase1_DoCreateFlatTable(jobFlow);

    // create hive global dictionary
    KylinConfig dictConfig = flatDesc.getSegment().getConfig();
    String[] mrHiveDictColumns = dictConfig.getMrHiveDictColumnsExcludeRefColumns();
    if (Objects.nonNull(mrHiveDictColumns) && mrHiveDictColumns.length > 0
            && !"".equals(mrHiveDictColumns[0])) {
        addStepPhase1_DoCreateMrHiveGlobalDict(jobFlow, mrHiveDictColumns);
    }

    // then count and redistribute
    if (cubeConfig.isHiveRedistributeEnabled()) {
        final KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
        //jobFlow.addTask(createRedistributeFlatHiveTableStep(hiveInitStatements, cubeName, flatDesc, cubeInstance.getDescriptor()));
        if (kylinConfig.isLivyEnabled() && cubeInstance.getEngineType() == IEngineAware.ID_SPARK) {
            jobFlow.addTask(createRedistributeFlatHiveTableByLivyStep(hiveInitStatements, cubeName,
                    flatDesc, cubeInstance.getDescriptor()));
        } else {
            jobFlow.addTask(createRedistributeFlatHiveTableStep(hiveInitStatements, cubeName,
                    flatDesc, cubeInstance.getDescriptor()));
        }
    }

    // special for hive
    addStepPhase1_DoMaterializeLookupTable(jobFlow);
}
Example 5
Source File: BuildGlobalHiveDictPartBuildReducer.java From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException, InterruptedException {
    mos = new MultipleOutputs(context);

    KylinConfig config;
    try {
        config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    dicCols = config.getMrHiveDictColumnsExcludeRefColumns();
}
Example 6
Source File: BuildGlobalHiveDictPartBuildMapper.java From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException, InterruptedException {
    tmpbuf = ByteBuffer.allocate(64);

    KylinConfig config;
    try {
        config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String[] dicCols = config.getMrHiveDictColumnsExcludeRefColumns();
    logger.info("kylin.dictionary.mr-hive.columns: exclude ref cols {}", dicCols);

    //eg: /user/kylin/warehouse/db/kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by/dict_column=KYLIN_SALES_SELLER_ID/part-000
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    //eg: dict_column=KYLIN_SALES_SELLER_ID
    String name = fileSplit.getPath().getParent().getName();
    logger.info("this map file name :{}", name);
    //eg: KYLIN_SALES_SELLER_ID
    String colName = name.split("=")[1];
    logger.info("this map build col name :{}", colName);

    for (int i = 0; i < dicCols.length; i++) {
        if (dicCols[i].equalsIgnoreCase(colName)) {
            colIndex = i;
        }
    }

    if (colIndex < 0 || colIndex > 127) {
        logger.error("kylin.dictionary.mr-hive.columns colIndex :{} error ", colIndex);
        logger.error("kylin.dictionary.mr-hive.columns set error,mr-hive columns's count should less than 128");
    }
    logger.info("this map build col index :{}", colIndex);
}
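The mapper above works out which dictionary column it is processing from the parent directory of its input split (dict_column=<COLUMN>) and matches that name against the array returned by getMrHiveDictColumnsExcludeRefColumns(). Below is a self-contained sketch of that lookup; the class name and the hard-coded values stand in for the real FileSplit and KylinConfig and are illustrative only.

public class DictColumnIndexDemo {

    public static void main(String[] args) {
        // Columns as they might be returned by getMrHiveDictColumnsExcludeRefColumns() (illustrative values)
        String[] dicCols = {"KYLIN_SALES_BUYER_ID", "KYLIN_SALES_SELLER_ID"};

        // Parent directory of the input split, e.g. .../dict_column=KYLIN_SALES_SELLER_ID/part-000
        String parentDirName = "dict_column=KYLIN_SALES_SELLER_ID";

        // Same parsing as the mapper: take the value after '=' and match it case-insensitively
        String colName = parentDirName.split("=")[1];
        int colIndex = -1;
        for (int i = 0; i < dicCols.length; i++) {
            if (dicCols[i].equalsIgnoreCase(colName)) {
                colIndex = i;
            }
        }
        System.out.println("column " + colName + " maps to index " + colIndex);
    }
}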
Example 7
Source File: BuildGlobalHiveDictTotalBuildJob.java From kylin with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    String[] dicColsArr = null;
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_GLOBAL_DIC_MAX_DISTINCT_COUNT);
        options.addOption(OPTION_GLOBAL_DIC_PART_REDUCE_STATS);
        parseOptions(options, args);

        KylinConfig config = KylinConfig.getInstanceFromEnv();
        dicColsArr = config.getMrHiveDictColumnsExcludeRefColumns();

        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        logger.info("Starting: " + job.getJobName());

        // ----------------------------------------------------------------------------
        // add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment segment = cube.getSegmentById(segmentID);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
        job.getConfiguration().set("partition.statistics.path", getOptionValue(OPTION_GLOBAL_DIC_PART_REDUCE_STATS));
        job.getConfiguration().set("last.max.dic.value.path", getOptionValue(OPTION_GLOBAL_DIC_MAX_DISTINCT_COUNT));
        job.getConfiguration().setBoolean("mapreduce.output.fileoutputformat.compress", false);

        job.setJarByClass(BuildGlobalHiveDictTotalBuildJob.class);
        setJobClasspath(job, cube.getConfig());

        // Mapper
        job.setMapperClass(BuildGlobalHiveDictTotalBuildMapper.class);

        // Input Output
        setInput(job, getOptionValue(OPTION_INPUT_PATH));
        setOutput(job, dicColsArr, getOptionValue(OPTION_OUTPUT_PATH));
        job.setNumReduceTasks(0); //no reduce
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        // prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

        // delete output
        Path baseOutputPath = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        deletePath(job.getConfiguration(), baseOutputPath);

        attachSegmentMetadataWithDict(segment, job.getConfiguration());
        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 8
Source File: BuildGlobalHiveDictPartBuildJob.java From kylin with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    String[] dicColsArr = null;
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        parseOptions(options, args);

        KylinConfig config = KylinConfig.getInstanceFromEnv();
        dicColsArr = config.getMrHiveDictColumnsExcludeRefColumns();

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));

        // add metadata to distributed cache
        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);
        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment segment = cube.getSegmentById(segmentID);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
        logger.info("Starting: " + job.getJobName());

        job.setJarByClass(BuildGlobalHiveDictPartBuildJob.class);
        setJobClasspath(job, cube.getConfig());

        //FileInputFormat.setInputPaths(job, input);
        setInput(job, dicColsArr, getInputPath(config, segment));

        // make each reducer output to respective dir
        setOutput(job, dicColsArr, getOptionValue(OPTION_OUTPUT_PATH));
        job.getConfiguration().setBoolean("mapreduce.output.fileoutputformat.compress", false);

        //set reduce num
        setReduceNum(job, config);

        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(BuildGlobalHiveDictPartBuildMapper.class);
        job.setPartitionerClass(BuildGlobalHiveDictPartPartitioner.class);
        job.setReducerClass(BuildGlobalHiveDictPartBuildReducer.class);

        // prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

        // delete output
        Path baseOutputPath = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        deletePath(job.getConfiguration(), baseOutputPath);

        attachSegmentMetadataWithDict(segment, job.getConfiguration());
        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 9
Source File: BuildGlobalHiveDictTotalBuildMapper.java From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config;
    try {
        config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    cols = config.getMrHiveDictColumnsExcludeRefColumns();

    String statPath = conf.get("partition.statistics.path");

    // get the input file name, the file name format by colIndex-part-partitionNum, eg: 1-part-000019
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    String[] arr = fileSplit.getPath().getName().split("-");
    int partitionNum = Integer.parseInt(arr[2]);
    colIndex = Integer.parseInt(arr[0]);
    colName = cols[colIndex];
    logger.info("Input fileName:{}, colIndex:{}, colName:{}, partitionNum:{}", fileSplit.getPath().getName(),
            colIndex, colName, partitionNum);

    //last max dic value per column
    String lastMaxValuePath = conf.get("last.max.dic.value.path");
    logger.info("last.max.dic.value.path:" + lastMaxValuePath);
    long lastMaxDictValue = this.getLastMaxDicValue(conf, lastMaxValuePath);
    logger.info("last.max.dic.value.path:" + lastMaxValuePath + ",value=" + lastMaxDictValue);

    // Calculate the starting position of this file:
    // starting position = sum (count) of all previous partitions + last max dic value of the column
    Map<Integer, TreeMap<Integer, Long>> allStats = getPartitionsCount(conf, statPath); //<colIndex,<reduceNum,count>>
    TreeMap<Integer, Long> partitionStats = allStats.get(colIndex);
    if (partitionNum != 0) {
        SortedMap<Integer, Long> subStat = partitionStats.subMap(0, true, partitionNum, false);
        subStat.forEach((k, v) -> {
            logger.info("Split num:{} and it's count:{}", k, v);
            start += v;
        });
    }
    start += lastMaxDictValue;

    logger.info("global dic.{}.split.num.{} build dict start offset is {}", colName, partitionNum, start);
}
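The start offset computed above follows the rule spelled out in the comment: the first dictionary id handed out for this partition equals the previous maximum dictionary value of the column plus the distinct-value counts of all earlier partitions of that column. Below is a standalone sketch of the same calculation; the class name and the numbers are made up for illustration.

import java.util.SortedMap;
import java.util.TreeMap;

public class DictStartOffsetDemo {

    public static void main(String[] args) {
        // Per-partition distinct-value counts for one column, keyed by reducer/partition
        // number, analogous to the mapper's partitionStats map (illustrative values)
        TreeMap<Integer, Long> partitionStats = new TreeMap<>();
        partitionStats.put(0, 100L);
        partitionStats.put(1, 250L);
        partitionStats.put(2, 80L);

        long lastMaxDictValue = 1000L; // max dictionary value from earlier builds of this column
        int partitionNum = 2;          // the partition this mapper is processing

        // Same rule as the mapper: start = last max value + counts of all earlier partitions
        long start = lastMaxDictValue;
        if (partitionNum != 0) {
            SortedMap<Integer, Long> earlier = partitionStats.subMap(0, true, partitionNum, false);
            for (long count : earlier.values()) {
                start += count;
            }
        }
        System.out.println("dictionary ids for partition " + partitionNum + " start at " + start);
    }
}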