Java Code Examples for org.apache.kylin.metadata.model.TblColRef#getIdentity()
The following examples show how to use org.apache.kylin.metadata.model.TblColRef#getIdentity().
The examples are extracted from open-source projects — Apache Kylin and its kylin-on-parquet-v2 variant, both under the Apache License 2.0 — and the source file and vote count are listed above each example.
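Across these examples, getIdentity() returns the fully qualified column name (typically TABLE_ALIAS.COLUMN_NAME) and is used as a stable key or directory name for per-column artifacts such as dictionaries. The minimal sketch below restates that common pattern; the dictFileNameFor helper and the ".rldict" value assumed for DICT_FILE_POSTFIX are illustrative, inferred from the examples rather than taken from the Kylin API.

    import org.apache.kylin.metadata.model.TblColRef;

    public class ColumnIdentityDemo {

        // Assumed postfix; the examples' comments mention output files named "colName.rldict-r-00000".
        private static final String DICT_FILE_POSTFIX = ".rldict";

        // Mirrors the pattern used in the dictionary-output examples below:
        // the column identity ("TABLE_ALIAS.COLUMN_NAME") becomes the directory,
        // and the bare column name becomes the file name.
        static String dictFileNameFor(TblColRef col) {
            return col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;
        }
    }

For a hypothetical column FACT.SELLER_ID, this sketch would produce "FACT.SELLER_ID/SELLER_ID.rldict".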
Example 1
Source File: SparkFactDistinct.java From kylin-on-parquet-v2 with Apache License 2.0 | 6 votes |
private void outputDict(TblColRef col, Dictionary<String> dict,
        List<Tuple2<String, Tuple3<Writable, Writable, String>>> result) throws IOException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        result.add(new Tuple2<String, Tuple3<Writable, Writable, String>>(BatchConstants.CFG_OUTPUT_DICT,
                new Tuple3<Writable, Writable, String>(NullWritable.get(),
                        new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName)));
    }
}
Example 2
Source File: SparkFactDistinct.java From kylin with Apache License 2.0 | 6 votes |
private void outputDict(TblColRef col, Dictionary<String> dict,
        List<Tuple2<String, Tuple3<Writable, Writable, String>>> result) throws IOException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        result.add(new Tuple2<String, Tuple3<Writable, Writable, String>>(BatchConstants.CFG_OUTPUT_DICT,
                new Tuple3<Writable, Writable, String>(NullWritable.get(),
                        new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName)));
    }
}
Example 3
Source File: AggregationGroup.java From kylin with Apache License 2.0 | 5 votes |
private void normalizeColumnNames(String[] names) {
    if (names == null)
        return;

    for (int i = 0; i < names.length; i++) {
        TblColRef col = cubeDesc.getModel().findColumn(names[i]);
        names[i] = col.getIdentity();
    }

    // check no dup
    Set<String> set = new HashSet<>(Arrays.asList(names));
    if (set.size() < names.length)
        throw new IllegalStateException(
                "Columns in aggrgroup must not contain duplication: " + Arrays.asList(names));
}
Example 4
Source File: FactDistinctColumnsReducer.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
private void outputDict(TblColRef col, Dictionary<String> dict) throws IOException, InterruptedException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        mos.write(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(),
                new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
    }
}
Example 5
Source File: DictionaryGetterUtil.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
public static Map<TblColRef, Dictionary<String>> getDictionaryMap(CubeSegment cubeSegment, InputSplit inputSplit,
        Configuration configuration) throws IOException {
    Map<TblColRef, Dictionary<String>> dictionaryMap = cubeSegment.buildDictionaryMap();

    String shrunkenDictPath = configuration.get(BatchConstants.ARG_SHRUNKEN_DICT_PATH);
    if (shrunkenDictPath == null) {
        return dictionaryMap;
    }

    // replace global dictionary with shrunken dictionary if possible
    String inputSplitSignature = getInputSplitSignature(cubeSegment, inputSplit);
    FileSystem fs = FileSystem.get(configuration);
    ShrunkenDictionary.StringValueSerializer valueSerializer = new ShrunkenDictionary.StringValueSerializer();

    for (TblColRef colRef : cubeSegment.getCubeDesc().getAllGlobalDictColumns()) {
        Path colShrunkenDictDir = new Path(shrunkenDictPath, colRef.getIdentity());
        Path colShrunkenDictPath = new Path(colShrunkenDictDir, inputSplitSignature);

        if (!fs.exists(colShrunkenDictPath)) {
            logger.warn("Shrunken dictionary for column " + colRef.getIdentity() + " in split "
                    + inputSplitSignature + " does not exist!!!");
            continue;
        }

        try (DataInputStream dis = fs.open(colShrunkenDictPath)) {
            Dictionary<String> shrunkenDict = new ShrunkenDictionary(valueSerializer);
            shrunkenDict.readFields(dis);

            dictionaryMap.put(colRef, shrunkenDict);
        }
    }

    return dictionaryMap;
}
Example 6
Source File: SparkBuildDictionary.java From kylin with Apache License 2.0 | 5 votes |
@Override
public Tuple2<String, Tuple3<String, Integer, Integer>> call(TblColRef tblColRef) throws Exception {
    if (initialized == false) {
        synchronized (SparkBuildDictionary.class) {
            if (initialized == false) {
                init();
            }
        }
    }

    logger.info("Building dictionary for column {}", tblColRef);
    IReadableTable inpTable = getDistinctValuesFor(tblColRef);

    Dictionary<String> preBuiltDict;
    DictionaryInfo dictInfo;
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(config)) {
        preBuiltDict = getDictionary(tblColRef);

        if (preBuiltDict != null) {
            logger.info("Dict for '{}' has already been built, save it", tblColRef.getName());
            dictInfo = dictManager.saveDictionary(tblColRef, inpTable, preBuiltDict);
        } else {
            logger.info("Dict for '{}' not pre-built, build it from {}", tblColRef.getName(), inpTable);
            String builderClass = cubeSegment.getCubeDesc().getDictionaryBuilderClass(tblColRef);
            dictInfo = dictManager.buildDictionary(tblColRef, inpTable, builderClass);
            preBuiltDict = dictInfo.getDictionaryObject();
        }
    }

    return new Tuple2<>(tblColRef.getIdentity(),
            new Tuple3<>(dictInfo.getResourcePath(), preBuiltDict.getSize(), preBuiltDict.getSizeOfId()));
}
Example 7
Source File: AggregationGroup.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
private void normalizeColumnNames(String[] names) {
    if (names == null)
        return;

    for (int i = 0; i < names.length; i++) {
        TblColRef col = cubeDesc.getModel().findColumn(names[i]);
        names[i] = col.getIdentity();
    }

    // check no dup
    Set<String> set = new HashSet<>(Arrays.asList(names));
    if (set.size() < names.length)
        throw new IllegalStateException(
                "Columns in aggrgroup must not contain duplication: " + Arrays.asList(names));
}
Example 8
Source File: CubeSegment.java From kylin with Apache License 2.0 | 5 votes |
public String getDictResPath(TblColRef col) {
    String r;
    String dictKey = col.getIdentity();
    r = getDictionaries().get(dictKey);

    // try Kylin v1.x dict key as well
    if (r == null) {
        String v1DictKey = col.getTable() + "/" + col.getName();
        r = getDictionaries().get(v1DictKey);
    }

    return r;
}
Example 9
Source File: SparkUHCDictionary.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
@Override
public Tuple2<String, Tuple3<Writable, Writable, String>> call(Tuple2<Integer, List<String>> columnValues) throws Exception {
    if (initialized == false) {
        synchronized (SparkFactDistinct.class) {
            if (initialized == false) {
                init();
            }
        }
    }

    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig.setAndUnsetThreadLocalConfig(config);
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        TblColRef col = uhcColumns.get(columnValues._1);
        logger.info("Processing column " + col.getName());

        if (cube.getDescriptor().getShardByColumns().contains(col)) {
            //for ShardByColumns
            builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
            builder.init(null, 0, null);
        } else {
            //for GlobalDictionaryColumns
            DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
            String builderClass = cubeDesc.getDictionaryBuilderClass(col);
            builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
            builder.init(dictionaryInfo, 0, hdfsDir);
        }

        Iterator<String> values = columnValues._2.iterator();
        while (values.hasNext()) {
            builder.addValue(values.next());
        }

        Dictionary<String> dict = builder.build();
        String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;
        logger.info("Dictionary file name is " + dictFileName);

        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        Tuple3 tuple3 = new Tuple3(NullWritable.get(), new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
        return new Tuple2<>(BatchConstants.CFG_OUTPUT_DICT, tuple3);
    }
}
Example 10
Source File: SparkBuildDictionary.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
@Override
public Tuple2<String, Tuple3<String, Integer, Integer>> call(TblColRef tblColRef) throws Exception {
    if (initialized == false) {
        synchronized (SparkBuildDictionary.class) {
            if (initialized == false) {
                init();
            }
        }
    }

    logger.info("Building dictionary for column {}", tblColRef);
    IReadableTable inpTable = getDistinctValuesFor(tblColRef);

    Dictionary<String> preBuiltDict;
    DictionaryInfo dictInfo;
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(config)) {
        preBuiltDict = getDictionary(tblColRef);

        if (preBuiltDict != null) {
            logger.info("Dict for '{}' has already been built, save it", tblColRef.getName());
            dictInfo = dictManager.saveDictionary(tblColRef, inpTable, preBuiltDict);
        } else {
            logger.info("Dict for '{}' not pre-built, build it from {}", tblColRef.getName(), inpTable);
            String builderClass = cubeSegment.getCubeDesc().getDictionaryBuilderClass(tblColRef);
            dictInfo = dictManager.buildDictionary(tblColRef, inpTable, builderClass);
            preBuiltDict = dictInfo.getDictionaryObject();
        }
    }

    return new Tuple2<>(tblColRef.getIdentity(),
            new Tuple3<>(dictInfo.getResourcePath(), preBuiltDict.getSize(), preBuiltDict.getSizeOfId()));
}
Example 11
Source File: DictionaryGetterUtil.java From kylin with Apache License 2.0 | 5 votes |
public static Map<TblColRef, Dictionary<String>> getDictionaryMap(CubeSegment cubeSegment, InputSplit inputSplit,
        Configuration configuration) throws IOException {
    Map<TblColRef, Dictionary<String>> dictionaryMap = cubeSegment.buildDictionaryMap();

    String shrunkenDictPath = configuration.get(BatchConstants.ARG_SHRUNKEN_DICT_PATH);
    if (shrunkenDictPath == null) {
        return dictionaryMap;
    }

    // replace global dictionary with shrunken dictionary if possible
    String inputSplitSignature = getInputSplitSignature(cubeSegment, inputSplit);
    FileSystem fs = FileSystem.get(configuration);
    ShrunkenDictionary.StringValueSerializer valueSerializer = new ShrunkenDictionary.StringValueSerializer();

    for (TblColRef colRef : cubeSegment.getCubeDesc().getAllGlobalDictColumns()) {
        Path colShrunkenDictDir = new Path(shrunkenDictPath, colRef.getIdentity());
        Path colShrunkenDictPath = new Path(colShrunkenDictDir, inputSplitSignature);

        if (!fs.exists(colShrunkenDictPath)) {
            logger.warn("Shrunken dictionary for column " + colRef.getIdentity() + " in split "
                    + inputSplitSignature + " does not exist!!!");
            continue;
        }

        try (DataInputStream dis = fs.open(colShrunkenDictPath)) {
            Dictionary<String> shrunkenDict = new ShrunkenDictionary(valueSerializer);
            shrunkenDict.readFields(dis);

            dictionaryMap.put(colRef, shrunkenDict);
        }
    }

    return dictionaryMap;
}
Example 12
Source File: UHCDictionaryReducer.java From kylin with Apache License 2.0 | 5 votes |
private void outputDict(TblColRef col, Dictionary<String> dict) throws IOException, InterruptedException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        mos.write(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(),
                new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
    }

    mos.close();
}
Example 13
Source File: FactDistinctColumnsBase.java From kylin with Apache License 2.0 | 5 votes |
private void outputDict(TblColRef col, Dictionary<String> dict, Visitor visitor) throws IOException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        visitor.collect(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(),
                new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
    }
}
Example 14
Source File: CubeSegment.java From kylin with Apache License 2.0 | 4 votes |
public String removeDictResPath(TblColRef col) {
    String dictKey = col.getIdentity();
    return getDictionaries().remove(dictKey);
}
Example 15
Source File: UHCDictionaryJob.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_CUBING_JOB_ID);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_INPUT_PATH);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));

        //add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        attachCubeMetadata(cube, job.getConfiguration());

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();

        //Note! handle uhc columns is null.
        boolean hasUHCValue = false;
        for (TblColRef tblColRef : uhcColumns) {
            Path path = new Path(input.toString() + "/" + tblColRef.getIdentity());
            if (HadoopUtil.getFileSystem(path).exists(path)) {
                FileInputFormat.addInputPath(job, path);
                FileInputFormat.setInputPathFilter(job, UHCDictPathFilter.class);
                hasUHCValue = true;
            }
        }

        if (!hasUHCValue) {
            isSkipped = true;
            return 0;
        }

        setJobClasspath(job, cube.getConfig());
        setupMapper();
        setupReducer(output, reducerCount);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);
        job.getConfiguration().set(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR,
                KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory());
        job.getConfiguration().set(BatchConstants.CFG_MAPRED_OUTPUT_COMPRESS, "false");

        //8G memory is enough for all global dict, because the input is sequential and we handle global dict slice by slice
        job.getConfiguration().set("mapreduce.reduce.memory.mb", "8500");
        job.getConfiguration().set("mapred.reduce.child.java.opts", "-Xmx8g");
        //Copying global dict to working dir in GlobalDictHDFSStore maybe elapsed a long time (Maybe we could improve it)
        //Waiting the global dict lock maybe also take a long time.
        //So we set 8 hours here
        job.getConfiguration().set("mapreduce.task.timeout", "28800000");

        //allow user specially set config for uhc step
        for (Map.Entry<String, String> entry : cube.getConfig().getUHCMRConfigOverride().entrySet()) {
            job.getConfiguration().set(entry.getKey(), entry.getValue());
        }

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 16
Source File: SparkUHCDictionary.java From kylin with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);

        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
Example 17
Source File: SparkBuildDictionary.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
public IReadableTable getDistinctValuesFor(TblColRef col) {
    return new SortedColumnDFSFile(factColumnsInputPath + "/" + col.getIdentity(), col.getType());
}
Example 18
Source File: SparkUHCDictionary.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);

        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
Example 19
Source File: CubeSegment.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
public void putDictResPath(TblColRef col, String dictResPath) {
    String dictKey = col.getIdentity();
    getDictionaries().put(dictKey, dictResPath);
}
Example 20
Source File: SparkBuildDictionary.java From kylin with Apache License 2.0 | 4 votes |
public IReadableTable getDistinctValuesFor(TblColRef col) {
    return new SortedColumnDFSFile(factColumnsInputPath + "/" + col.getIdentity(), col.getType());
}