Java Code Examples for org.apache.kylin.metadata.model.TblColRef#getIdentity()
The following examples show how to use org.apache.kylin.metadata.model.TblColRef#getIdentity().
The examples are extracted from open-source projects — Apache Kylin and its kylin-on-parquet-v2 variant, both under the Apache License 2.0 — and the source file and vote count are listed above each example.
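Across these examples, getIdentity() returns the fully qualified column name (typically TABLE_ALIAS.COLUMN_NAME) and is used as a stable key or directory name for per-column artifacts such as dictionaries. The minimal sketch below restates that common pattern; the dictFileNameFor helper and the ".rldict" value assumed for DICT_FILE_POSTFIX are illustrative, inferred from the examples rather than taken from the Kylin API.

    import org.apache.kylin.metadata.model.TblColRef;

    public class ColumnIdentityDemo {

        // Assumed postfix; the examples' comments mention output files named "colName.rldict-r-00000".
        private static final String DICT_FILE_POSTFIX = ".rldict";

        // Mirrors the pattern used in the dictionary-output examples below:
        // the column identity ("TABLE_ALIAS.COLUMN_NAME") becomes the directory,
        // and the bare column name becomes the file name.
        static String dictFileNameFor(TblColRef col) {
            return col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;
        }
    }

For a hypothetical column FACT.SELLER_ID, this sketch would produce "FACT.SELLER_ID/SELLER_ID.rldict".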
Example 1
Source File: SparkFactDistinct.java From kylin-on-parquet-v2 with Apache License 2.0 | 6 votes |
private void outputDict(TblColRef col, Dictionary<String> dict,
        List<Tuple2<String, Tuple3<Writable, Writable, String>>> result) throws IOException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        result.add(new Tuple2<String, Tuple3<Writable, Writable, String>>(BatchConstants.CFG_OUTPUT_DICT,
                new Tuple3<Writable, Writable, String>(NullWritable.get(),
                        new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName)));
    }
}
Example 2
Source File: SparkFactDistinct.java From kylin with Apache License 2.0 | 6 votes |
private void outputDict(TblColRef col, Dictionary<String> dict,
        List<Tuple2<String, Tuple3<Writable, Writable, String>>> result) throws IOException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        result.add(new Tuple2<String, Tuple3<Writable, Writable, String>>(BatchConstants.CFG_OUTPUT_DICT,
                new Tuple3<Writable, Writable, String>(NullWritable.get(),
                        new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName)));
    }
}
Example 3
Source File: AggregationGroup.java From kylin with Apache License 2.0 | 5 votes |
private void normalizeColumnNames(String[] names) {
    if (names == null)
        return;

    for (int i = 0; i < names.length; i++) {
        TblColRef col = cubeDesc.getModel().findColumn(names[i]);
        names[i] = col.getIdentity();
    }

    // check no dup
    Set<String> set = new HashSet<>(Arrays.asList(names));
    if (set.size() < names.length)
        throw new IllegalStateException(
                "Columns in aggrgroup must not contain duplication: " + Arrays.asList(names));
}
Example 4
Source File: FactDistinctColumnsReducer.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
private void outputDict(TblColRef col, Dictionary<String> dict) throws IOException, InterruptedException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        mos.write(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(),
                new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
    }
}
Example 5
Source File: DictionaryGetterUtil.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
public static Map<TblColRef, Dictionary<String>> getDictionaryMap(CubeSegment cubeSegment, InputSplit inputSplit,
        Configuration configuration) throws IOException {
    Map<TblColRef, Dictionary<String>> dictionaryMap = cubeSegment.buildDictionaryMap();

    String shrunkenDictPath = configuration.get(BatchConstants.ARG_SHRUNKEN_DICT_PATH);
    if (shrunkenDictPath == null) {
        return dictionaryMap;
    }

    // replace global dictionary with shrunken dictionary if possible
    String inputSplitSignature = getInputSplitSignature(cubeSegment, inputSplit);
    FileSystem fs = FileSystem.get(configuration);
    ShrunkenDictionary.StringValueSerializer valueSerializer = new ShrunkenDictionary.StringValueSerializer();

    for (TblColRef colRef : cubeSegment.getCubeDesc().getAllGlobalDictColumns()) {
        Path colShrunkenDictDir = new Path(shrunkenDictPath, colRef.getIdentity());
        Path colShrunkenDictPath = new Path(colShrunkenDictDir, inputSplitSignature);

        if (!fs.exists(colShrunkenDictPath)) {
            logger.warn("Shrunken dictionary for column " + colRef.getIdentity() + " in split "
                    + inputSplitSignature + " does not exist!!!");
            continue;
        }

        try (DataInputStream dis = fs.open(colShrunkenDictPath)) {
            Dictionary<String> shrunkenDict = new ShrunkenDictionary(valueSerializer);
            shrunkenDict.readFields(dis);

            dictionaryMap.put(colRef, shrunkenDict);
        }
    }

    return dictionaryMap;
}
Example 6
Source File: SparkBuildDictionary.java From kylin with Apache License 2.0 | 5 votes |
@Override
public Tuple2<String, Tuple3<String, Integer, Integer>> call(TblColRef tblColRef) throws Exception {
    if (initialized == false) {
        synchronized (SparkBuildDictionary.class) {
            if (initialized == false) {
                init();
            }
        }
    }

    logger.info("Building dictionary for column {}", tblColRef);
    IReadableTable inpTable = getDistinctValuesFor(tblColRef);

    Dictionary<String> preBuiltDict;
    DictionaryInfo dictInfo;
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(config)) {
        preBuiltDict = getDictionary(tblColRef);

        if (preBuiltDict != null) {
            logger.info("Dict for '{}' has already been built, save it", tblColRef.getName());
            dictInfo = dictManager.saveDictionary(tblColRef, inpTable, preBuiltDict);
        } else {
            logger.info("Dict for '{}' not pre-built, build it from {}", tblColRef.getName(), inpTable);
            String builderClass = cubeSegment.getCubeDesc().getDictionaryBuilderClass(tblColRef);
            dictInfo = dictManager.buildDictionary(tblColRef, inpTable, builderClass);
            preBuiltDict = dictInfo.getDictionaryObject();
        }
    }

    return new Tuple2<>(tblColRef.getIdentity(),
            new Tuple3<>(dictInfo.getResourcePath(), preBuiltDict.getSize(), preBuiltDict.getSizeOfId()));
}
Example 7
Source File: AggregationGroup.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
private void normalizeColumnNames(String[] names) {
    if (names == null)
        return;

    for (int i = 0; i < names.length; i++) {
        TblColRef col = cubeDesc.getModel().findColumn(names[i]);
        names[i] = col.getIdentity();
    }

    // check no dup
    Set<String> set = new HashSet<>(Arrays.asList(names));
    if (set.size() < names.length)
        throw new IllegalStateException(
                "Columns in aggrgroup must not contain duplication: " + Arrays.asList(names));
}
Example 8
Source File: CubeSegment.java From kylin with Apache License 2.0 | 5 votes |
public String getDictResPath(TblColRef col) {
    String r;
    String dictKey = col.getIdentity();
    r = getDictionaries().get(dictKey);

    // try Kylin v1.x dict key as well
    if (r == null) {
        String v1DictKey = col.getTable() + "/" + col.getName();
        r = getDictionaries().get(v1DictKey);
    }

    return r;
}
Example 9
Source File: SparkUHCDictionary.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
@Override
public Tuple2<String, Tuple3<Writable, Writable, String>> call(Tuple2<Integer, List<String>> columnValues) throws Exception {
    if (initialized == false) {
        synchronized (SparkFactDistinct.class) {
            if (initialized == false) {
                init();
            }
        }
    }

    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig.setAndUnsetThreadLocalConfig(config);
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        TblColRef col = uhcColumns.get(columnValues._1);
        logger.info("Processing column " + col.getName());

        if (cube.getDescriptor().getShardByColumns().contains(col)) {
            //for ShardByColumns
            builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
            builder.init(null, 0, null);
        } else {
            //for GlobalDictionaryColumns
            DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
            String builderClass = cubeDesc.getDictionaryBuilderClass(col);
            builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
            builder.init(dictionaryInfo, 0, hdfsDir);
        }

        Iterator<String> values = columnValues._2.iterator();
        while (values.hasNext()) {
            builder.addValue(values.next());
        }

        Dictionary<String> dict = builder.build();
        String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;
        logger.info("Dictionary file name is " + dictFileName);

        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        Tuple3 tuple3 = new Tuple3(NullWritable.get(), new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
        return new Tuple2<>(BatchConstants.CFG_OUTPUT_DICT, tuple3);
    }
}
Example 10
Source File: SparkBuildDictionary.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
@Override
public Tuple2<String, Tuple3<String, Integer, Integer>> call(TblColRef tblColRef) throws Exception {
    if (initialized == false) {
        synchronized (SparkBuildDictionary.class) {
            if (initialized == false) {
                init();
            }
        }
    }

    logger.info("Building dictionary for column {}", tblColRef);
    IReadableTable inpTable = getDistinctValuesFor(tblColRef);

    Dictionary<String> preBuiltDict;
    DictionaryInfo dictInfo;
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(config)) {
        preBuiltDict = getDictionary(tblColRef);

        if (preBuiltDict != null) {
            logger.info("Dict for '{}' has already been built, save it", tblColRef.getName());
            dictInfo = dictManager.saveDictionary(tblColRef, inpTable, preBuiltDict);
        } else {
            logger.info("Dict for '{}' not pre-built, build it from {}", tblColRef.getName(), inpTable);
            String builderClass = cubeSegment.getCubeDesc().getDictionaryBuilderClass(tblColRef);
            dictInfo = dictManager.buildDictionary(tblColRef, inpTable, builderClass);
            preBuiltDict = dictInfo.getDictionaryObject();
        }
    }

    return new Tuple2<>(tblColRef.getIdentity(),
            new Tuple3<>(dictInfo.getResourcePath(), preBuiltDict.getSize(), preBuiltDict.getSizeOfId()));
}
Example 11
Source File: DictionaryGetterUtil.java From kylin with Apache License 2.0 | 5 votes |
public static Map<TblColRef, Dictionary<String>> getDictionaryMap(CubeSegment cubeSegment, InputSplit inputSplit,
        Configuration configuration) throws IOException {
    Map<TblColRef, Dictionary<String>> dictionaryMap = cubeSegment.buildDictionaryMap();

    String shrunkenDictPath = configuration.get(BatchConstants.ARG_SHRUNKEN_DICT_PATH);
    if (shrunkenDictPath == null) {
        return dictionaryMap;
    }

    // replace global dictionary with shrunken dictionary if possible
    String inputSplitSignature = getInputSplitSignature(cubeSegment, inputSplit);
    FileSystem fs = FileSystem.get(configuration);
    ShrunkenDictionary.StringValueSerializer valueSerializer = new ShrunkenDictionary.StringValueSerializer();

    for (TblColRef colRef : cubeSegment.getCubeDesc().getAllGlobalDictColumns()) {
        Path colShrunkenDictDir = new Path(shrunkenDictPath, colRef.getIdentity());
        Path colShrunkenDictPath = new Path(colShrunkenDictDir, inputSplitSignature);

        if (!fs.exists(colShrunkenDictPath)) {
            logger.warn("Shrunken dictionary for column " + colRef.getIdentity() + " in split "
                    + inputSplitSignature + " does not exist!!!");
            continue;
        }

        try (DataInputStream dis = fs.open(colShrunkenDictPath)) {
            Dictionary<String> shrunkenDict = new ShrunkenDictionary(valueSerializer);
            shrunkenDict.readFields(dis);

            dictionaryMap.put(colRef, shrunkenDict);
        }
    }

    return dictionaryMap;
}
Example 12
Source File: UHCDictionaryReducer.java From kylin with Apache License 2.0 | 5 votes |
private void outputDict(TblColRef col, Dictionary<String> dict) throws IOException, InterruptedException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        mos.write(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(),
                new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
    }

    mos.close();
}
Example 13
Source File: FactDistinctColumnsBase.java From kylin with Apache License 2.0 | 5 votes |
private void outputDict(TblColRef col, Dictionary<String> dict, Visitor visitor) throws IOException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        visitor.collect(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(),
                new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
    }
}
Example 14
Source File: CubeSegment.java From kylin with Apache License 2.0 | 4 votes |
public String removeDictResPath(TblColRef col) {
    String dictKey = col.getIdentity();
    return getDictionaries().remove(dictKey);
}
Example 15
Source File: UHCDictionaryJob.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_CUBING_JOB_ID);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_INPUT_PATH);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));

        //add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        attachCubeMetadata(cube, job.getConfiguration());

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();

        //Note! handle uhc columns is null.
        boolean hasUHCValue = false;
        for (TblColRef tblColRef : uhcColumns) {
            Path path = new Path(input.toString() + "/" + tblColRef.getIdentity());
            if (HadoopUtil.getFileSystem(path).exists(path)) {
                FileInputFormat.addInputPath(job, path);
                FileInputFormat.setInputPathFilter(job, UHCDictPathFilter.class);
                hasUHCValue = true;
            }
        }

        if (!hasUHCValue) {
            isSkipped = true;
            return 0;
        }

        setJobClasspath(job, cube.getConfig());
        setupMapper();
        setupReducer(output, reducerCount);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);
        job.getConfiguration().set(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR,
                KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory());
        job.getConfiguration().set(BatchConstants.CFG_MAPRED_OUTPUT_COMPRESS, "false");

        //8G memory is enough for all global dict, because the input is sequential and we handle global dict slice by slice
        job.getConfiguration().set("mapreduce.reduce.memory.mb", "8500");
        job.getConfiguration().set("mapred.reduce.child.java.opts", "-Xmx8g");
        //Copying global dict to working dir in GlobalDictHDFSStore maybe elapsed a long time (Maybe we could improve it)
        //Waiting the global dict lock maybe also take a long time.
        //So we set 8 hours here
        job.getConfiguration().set("mapreduce.task.timeout", "28800000");

        //allow user specially set config for uhc step
        for (Map.Entry<String, String> entry : cube.getConfig().getUHCMRConfigOverride().entrySet()) {
            job.getConfiguration().set(entry.getKey(), entry.getValue());
        }

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 16
Source File: SparkUHCDictionary.java From kylin with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);

        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
Example 17
Source File: SparkBuildDictionary.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
public IReadableTable getDistinctValuesFor(TblColRef col) {
    return new SortedColumnDFSFile(factColumnsInputPath + "/" + col.getIdentity(), col.getType());
}
Example 18
Source File: SparkUHCDictionary.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);

        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
Example 19
Source File: CubeSegment.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
public void putDictResPath(TblColRef col, String dictResPath) {
    String dictKey = col.getIdentity();
    getDictionaries().put(dictKey, dictResPath);
}
Example 20
Source File: SparkBuildDictionary.java From kylin with Apache License 2.0 | 4 votes |
public IReadableTable getDistinctValuesFor(TblColRef col) {
    return new SortedColumnDFSFile(factColumnsInputPath + "/" + col.getIdentity(), col.getType());
}