Java Code Examples for org.apache.kylin.common.KylinConfig#getMrHiveDictColumnsExcludeRefColumns()
The following examples show how to use org.apache.kylin.common.KylinConfig#getMrHiveDictColumnsExcludeRefColumns().
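Before looking at the examples, a minimal sketch of how the method is typically consumed may help: as the name suggests, it returns the Hive global-dictionary columns with any "ref" columns (columns that reuse another column's dictionary) filtered out. The demo class below is illustrative only; it assumes a running Kylin environment in which KylinConfig.getInstanceFromEnv() can load kylin.properties, and it mirrors the null/empty guard used throughout the examples.

import java.util.Objects;

import org.apache.kylin.common.KylinConfig;

public class MrHiveDictColumnsDemo {

    public static void main(String[] args) {
        // Obtain the KylinConfig the same way several of the examples below do
        KylinConfig config = KylinConfig.getInstanceFromEnv();

        // Columns that need a Hive global dictionary, excluding "ref" columns
        String[] dictCols = config.getMrHiveDictColumnsExcludeRefColumns();

        // The examples below consistently guard against a null or empty result
        if (Objects.nonNull(dictCols) && dictCols.length > 0 && !"".equals(dictCols[0])) {
            for (String col : dictCols) {
                System.out.println("Hive global dictionary column: " + col);
            }
        }
    }
}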
Example 1
Source File: HiveInputBase.java From kylin with Apache License 2.0
@Override
public void addStepPhase_ReplaceFlatTableGlobalColumnValue(DefaultChainedExecutable jobFlow) {
    KylinConfig dictConfig = flatDesc.getSegment().getConfig();
    final String cubeName = CubingExecutableUtil.getCubeName(jobFlow.getParams());
    String globalDictTable = MRHiveDictUtil.globalDictTableName(flatDesc, cubeName);
    String globalDictDatabase = dictConfig.getMrHiveDictDB();

    String[] mrHiveDictColumnsExcludeRefCols = dictConfig.getMrHiveDictColumnsExcludeRefColumns();
    Map<String, String> dictRef = dictConfig.getMrHiveDictRefColumns();
    final String hiveInitStatements = JoinedFlatTable.generateHiveInitStatements(flatTableDatabase);

    if (Objects.nonNull(mrHiveDictColumnsExcludeRefCols) && mrHiveDictColumnsExcludeRefCols.length > 0) {
        jobFlow.addTask(createHiveGlobalDictMergeGlobalDict(flatDesc, hiveInitStatements, cubeName,
                mrHiveDictColumnsExcludeRefCols, globalDictDatabase, globalDictTable));
        for (String item : mrHiveDictColumnsExcludeRefCols) {
            dictRef.put(item, "");
        }
    }

    // replace step
    if (!dictRef.isEmpty()) {
        jobFlow.addTask(createMrHiveGlobalDictReplaceStep(flatDesc, hiveInitStatements, cubeName,
                dictRef, flatTableDatabase, globalDictDatabase, globalDictTable,
                dictConfig.getMrHiveDictTableSuffix(), jobFlow.getId()));
    }
}
Example 2
Source File: BatchCubingJobBuilder2.java From kylin with Apache License 2.0
/**
 * Build hive global dictionary by MR and encode corresponding column into integer for flat table
 */
protected void buildHiveGlobalDictionaryByMR(final CubingJob result, String jobId) {
    KylinConfig dictConfig = seg.getConfig();
    String[] mrHiveDictColumnExcludeRef = dictConfig.getMrHiveDictColumnsExcludeRefColumns();
    String[] mrHiveDictColumns = dictConfig.getMrHiveDictColumns();

    if (Objects.nonNull(mrHiveDictColumnExcludeRef) && mrHiveDictColumnExcludeRef.length > 0
            && !"".equals(mrHiveDictColumnExcludeRef[0])) {
        // 1. parallel part build
        result.addTask(createBuildGlobalHiveDictPartBuildJob(jobId));
        // 2. parallel total build
        result.addTask(createBuildGlobalHiveDictTotalBuildJob(jobId));
    }

    // Merge new dictionary entry into global dictionary and replace/encode flat table
    if (Objects.nonNull(mrHiveDictColumns) && mrHiveDictColumns.length > 0
            && !"".equals(mrHiveDictColumns[0])) {
        inputSide.addStepPhase_ReplaceFlatTableGlobalColumnValue(result);
    }
}
Example 3
Source File: SparkBatchCubingJobBuilder2.java From kylin with Apache License 2.0
/**
 * Build hive global dictionary by MR and encode corresponding column into integer for flat table
 */
protected void buildHiveGlobalDictionaryByMR(final CubingJob result, String jobId) {
    KylinConfig dictConfig = seg.getConfig();
    String[] mrHiveDictColumnExcludeRef = dictConfig.getMrHiveDictColumnsExcludeRefColumns();
    String[] mrHiveDictColumns = dictConfig.getMrHiveDictColumns();

    if (Objects.nonNull(mrHiveDictColumnExcludeRef) && mrHiveDictColumnExcludeRef.length > 0
            && !"".equals(mrHiveDictColumnExcludeRef[0])) {
        // 1. parallel part build
        result.addTask(createBuildGlobalHiveDictPartBuildJob(jobId));
        // 2. parallel total build
        result.addTask(createBuildGlobalHiveDictTotalBuildJob(jobId));
    }

    // merge global dic and replace flat table
    if (Objects.nonNull(mrHiveDictColumns) && mrHiveDictColumns.length > 0
            && !"".equals(mrHiveDictColumns[0])) {
        inputSide.addStepPhase_ReplaceFlatTableGlobalColumnValue(result);
    }
}
Example 4
Source File: HiveInputBase.java From kylin with Apache License 2.0
@Override
public void addStepPhase1_CreateFlatTable(DefaultChainedExecutable jobFlow) {
    final String cubeName = CubingExecutableUtil.getCubeName(jobFlow.getParams());
    CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv()).getCube(cubeName);
    final KylinConfig cubeConfig = cubeInstance.getConfig();
    final String hiveInitStatements = JoinedFlatTable.generateHiveInitStatements(flatTableDatabase);

    // create flat table first
    addStepPhase1_DoCreateFlatTable(jobFlow);

    // create hive global dictionary
    KylinConfig dictConfig = flatDesc.getSegment().getConfig();
    String[] mrHiveDictColumns = dictConfig.getMrHiveDictColumnsExcludeRefColumns();
    if (Objects.nonNull(mrHiveDictColumns) && mrHiveDictColumns.length > 0
            && !"".equals(mrHiveDictColumns[0])) {
        addStepPhase1_DoCreateMrHiveGlobalDict(jobFlow, mrHiveDictColumns);
    }

    // then count and redistribute
    if (cubeConfig.isHiveRedistributeEnabled()) {
        final KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
        //jobFlow.addTask(createRedistributeFlatHiveTableStep(hiveInitStatements, cubeName, flatDesc, cubeInstance.getDescriptor()));
        if (kylinConfig.isLivyEnabled() && cubeInstance.getEngineType() == IEngineAware.ID_SPARK) {
            jobFlow.addTask(createRedistributeFlatHiveTableByLivyStep(hiveInitStatements, cubeName,
                    flatDesc, cubeInstance.getDescriptor()));
        } else {
            jobFlow.addTask(createRedistributeFlatHiveTableStep(hiveInitStatements, cubeName,
                    flatDesc, cubeInstance.getDescriptor()));
        }
    }

    // special for hive
    addStepPhase1_DoMaterializeLookupTable(jobFlow);
}
Example 5
Source File: BuildGlobalHiveDictPartBuildReducer.java From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException, InterruptedException {
    mos = new MultipleOutputs(context);

    KylinConfig config;
    try {
        config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    dicCols = config.getMrHiveDictColumnsExcludeRefColumns();
}
Example 6
Source File: BuildGlobalHiveDictPartBuildMapper.java From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException, InterruptedException {
    tmpbuf = ByteBuffer.allocate(64);

    KylinConfig config;
    try {
        config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String[] dicCols = config.getMrHiveDictColumnsExcludeRefColumns();
    logger.info("kylin.dictionary.mr-hive.columns: exclude ref cols {}", dicCols);

    //eg: /user/kylin/warehouse/db/kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by/dict_column=KYLIN_SALES_SELLER_ID/part-000
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    //eg: dict_column=KYLIN_SALES_SELLER_ID
    String name = fileSplit.getPath().getParent().getName();
    logger.info("this map file name :{}", name);
    //eg: KYLIN_SALES_SELLER_ID
    String colName = name.split("=")[1];
    logger.info("this map build col name :{}", colName);

    for (int i = 0; i < dicCols.length; i++) {
        if (dicCols[i].equalsIgnoreCase(colName)) {
            colIndex = i;
        }
    }

    if (colIndex < 0 || colIndex > 127) {
        logger.error("kylin.dictionary.mr-hive.columns colIndex :{} error ", colIndex);
        logger.error("kylin.dictionary.mr-hive.columns set error,mr-hive columns's count should less than 128");
    }
    logger.info("this map build col index :{}", colIndex);
}
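The mapper above works out which dictionary column it is processing from the parent directory of its input split (dict_column=<COLUMN>) and matches that name against the array returned by getMrHiveDictColumnsExcludeRefColumns(). Below is a self-contained sketch of that lookup; the class name and the hard-coded values stand in for the real FileSplit and KylinConfig and are illustrative only.

public class DictColumnIndexDemo {

    public static void main(String[] args) {
        // Columns as they might be returned by getMrHiveDictColumnsExcludeRefColumns() (illustrative values)
        String[] dicCols = {"KYLIN_SALES_BUYER_ID", "KYLIN_SALES_SELLER_ID"};

        // Parent directory of the input split, e.g. .../dict_column=KYLIN_SALES_SELLER_ID/part-000
        String parentDirName = "dict_column=KYLIN_SALES_SELLER_ID";

        // Same parsing as the mapper: take the value after '=' and match it case-insensitively
        String colName = parentDirName.split("=")[1];
        int colIndex = -1;
        for (int i = 0; i < dicCols.length; i++) {
            if (dicCols[i].equalsIgnoreCase(colName)) {
                colIndex = i;
            }
        }
        System.out.println("column " + colName + " maps to index " + colIndex);
    }
}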
Example 7
Source File: BuildGlobalHiveDictTotalBuildJob.java From kylin with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    String[] dicColsArr = null;
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_GLOBAL_DIC_MAX_DISTINCT_COUNT);
        options.addOption(OPTION_GLOBAL_DIC_PART_REDUCE_STATS);
        parseOptions(options, args);

        KylinConfig config = KylinConfig.getInstanceFromEnv();
        dicColsArr = config.getMrHiveDictColumnsExcludeRefColumns();

        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        logger.info("Starting: " + job.getJobName());

        // ----------------------------------------------------------------------------
        // add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment segment = cube.getSegmentById(segmentID);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
        job.getConfiguration().set("partition.statistics.path", getOptionValue(OPTION_GLOBAL_DIC_PART_REDUCE_STATS));
        job.getConfiguration().set("last.max.dic.value.path", getOptionValue(OPTION_GLOBAL_DIC_MAX_DISTINCT_COUNT));
        job.getConfiguration().setBoolean("mapreduce.output.fileoutputformat.compress", false);

        job.setJarByClass(BuildGlobalHiveDictTotalBuildJob.class);
        setJobClasspath(job, cube.getConfig());

        // Mapper
        job.setMapperClass(BuildGlobalHiveDictTotalBuildMapper.class);

        // Input Output
        setInput(job, getOptionValue(OPTION_INPUT_PATH));
        setOutput(job, dicColsArr, getOptionValue(OPTION_OUTPUT_PATH));
        job.setNumReduceTasks(0); //no reduce
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        // prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

        // delete output
        Path baseOutputPath = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        deletePath(job.getConfiguration(), baseOutputPath);

        attachSegmentMetadataWithDict(segment, job.getConfiguration());
        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 8
Source File: BuildGlobalHiveDictPartBuildJob.java From kylin with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    String[] dicColsArr = null;
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        parseOptions(options, args);

        KylinConfig config = KylinConfig.getInstanceFromEnv();
        dicColsArr = config.getMrHiveDictColumnsExcludeRefColumns();

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));

        // add metadata to distributed cache
        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);
        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment segment = cube.getSegmentById(segmentID);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
        logger.info("Starting: " + job.getJobName());

        job.setJarByClass(BuildGlobalHiveDictPartBuildJob.class);
        setJobClasspath(job, cube.getConfig());

        //FileInputFormat.setInputPaths(job, input);
        setInput(job, dicColsArr, getInputPath(config, segment));

        // make each reducer output to respective dir
        setOutput(job, dicColsArr, getOptionValue(OPTION_OUTPUT_PATH));
        job.getConfiguration().setBoolean("mapreduce.output.fileoutputformat.compress", false);

        //set reduce num
        setReduceNum(job, config);

        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(BuildGlobalHiveDictPartBuildMapper.class);
        job.setPartitionerClass(BuildGlobalHiveDictPartPartitioner.class);
        job.setReducerClass(BuildGlobalHiveDictPartBuildReducer.class);

        // prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

        // delete output
        Path baseOutputPath = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        deletePath(job.getConfiguration(), baseOutputPath);

        attachSegmentMetadataWithDict(segment, job.getConfiguration());
        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 9
Source File: BuildGlobalHiveDictTotalBuildMapper.java From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config;
    try {
        config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    cols = config.getMrHiveDictColumnsExcludeRefColumns();

    String statPath = conf.get("partition.statistics.path");

    // get the input file name, the file name format by colIndex-part-partitionNum, eg: 1-part-000019
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    String[] arr = fileSplit.getPath().getName().split("-");
    int partitionNum = Integer.parseInt(arr[2]);
    colIndex = Integer.parseInt(arr[0]);
    colName = cols[colIndex];
    logger.info("Input fileName:{}, colIndex:{}, colName:{}, partitionNum:{}", fileSplit.getPath().getName(),
            colIndex, colName, partitionNum);

    //last max dic value per column
    String lastMaxValuePath = conf.get("last.max.dic.value.path");
    logger.info("last.max.dic.value.path:" + lastMaxValuePath);
    long lastMaxDictValue = this.getLastMaxDicValue(conf, lastMaxValuePath);
    logger.info("last.max.dic.value.path:" + lastMaxValuePath + ",value=" + lastMaxDictValue);

    // Calculate the starting position of this file:
    // starting position = sum (count) of all previous partitions + last max dic value of the column
    Map<Integer, TreeMap<Integer, Long>> allStats = getPartitionsCount(conf, statPath); //<colIndex,<reduceNum,count>>
    TreeMap<Integer, Long> partitionStats = allStats.get(colIndex);
    if (partitionNum != 0) {
        SortedMap<Integer, Long> subStat = partitionStats.subMap(0, true, partitionNum, false);
        subStat.forEach((k, v) -> {
            logger.info("Split num:{} and it's count:{}", k, v);
            start += v;
        });
    }
    start += lastMaxDictValue;

    logger.info("global dic.{}.split.num.{} build dict start offset is {}", colName, partitionNum, start);
}
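The start offset computed above follows the rule spelled out in the comment: the first dictionary id handed out for this partition equals the previous maximum dictionary value of the column plus the distinct-value counts of all earlier partitions of that column. Below is a standalone sketch of the same calculation; the class name and the numbers are made up for illustration.

import java.util.SortedMap;
import java.util.TreeMap;

public class DictStartOffsetDemo {

    public static void main(String[] args) {
        // Per-partition distinct-value counts for one column, keyed by reducer/partition
        // number, analogous to the mapper's partitionStats map (illustrative values)
        TreeMap<Integer, Long> partitionStats = new TreeMap<>();
        partitionStats.put(0, 100L);
        partitionStats.put(1, 250L);
        partitionStats.put(2, 80L);

        long lastMaxDictValue = 1000L; // max dictionary value from earlier builds of this column
        int partitionNum = 2;          // the partition this mapper is processing

        // Same rule as the mapper: start = last max value + counts of all earlier partitions
        long start = lastMaxDictValue;
        if (partitionNum != 0) {
            SortedMap<Integer, Long> earlier = partitionStats.subMap(0, true, partitionNum, false);
            for (long count : earlier.values()) {
                start += count;
            }
        }
        System.out.println("dictionary ids for partition " + partitionNum + " start at " + start);
    }
}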