Java Code Examples for org.apache.kylin.cube.CubeInstance#getConfig()
The following examples show how to use
org.apache.kylin.cube.CubeInstance#getConfig().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: CalculateStatsFromBaseCuboidReducer.java From kylin-on-parquet-v2 with Apache License 2.0 | 6 votes |
/**
 * Initialises the reducer's fields from the Hadoop job configuration and Kylin metadata.
 *
 * <p>Loads the Kylin config, resolves the cube named by {@code BatchConstants.CFG_CUBE_NAME},
 * and caches the cube config, base cuboid id, output path, sampling percentage and task id.
 * The row-count list and the cuboid HLL map start out empty.
 *
 * @param context the reducer context supplying the Hadoop configuration and task attempt id
 * @throws IOException declared by the overridden method
 */
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());

    final Configuration hadoopConf = context.getConfiguration();
    final KylinConfig kylinConfig = AbstractHadoopJob.loadKylinPropsAndMetadata();
    final String cubeName = hadoopConf.get(BatchConstants.CFG_CUBE_NAME);
    final CubeInstance cubeInstance = CubeManager.getInstance(kylinConfig).getCube(cubeName);

    cubeConfig = cubeInstance.getConfig();
    baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId();
    baseCuboidRowCountInMappers = Lists.newLinkedList();
    output = hadoopConf.get(BatchConstants.CFG_OUTPUT_PATH);

    final String samplingSetting = context.getConfiguration()
            .get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT);
    samplingPercentage = Integer.parseInt(samplingSetting);

    taskId = context.getTaskAttemptID().getTaskID().getId();
    cuboidHLLMap = Maps.newHashMap();
}
Example 2
Source File: HiveInputBase.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
@Override public void addStepPhase1_CreateFlatTable(DefaultChainedExecutable jobFlow) { final String cubeName = CubingExecutableUtil.getCubeName(jobFlow.getParams()); CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv()).getCube(cubeName); final KylinConfig cubeConfig = cubeInstance.getConfig(); final String hiveInitStatements = JoinedFlatTable.generateHiveInitStatements(flatTableDatabase); // create flat table first addStepPhase1_DoCreateFlatTable(jobFlow); // create global dict KylinConfig dictConfig = (flatDesc.getSegment()).getConfig(); String[] mrHiveDictColumns = dictConfig.getMrHiveDictColumns(); if (mrHiveDictColumns.length > 0) { String globalDictDatabase = dictConfig.getMrHiveDictDB(); if (null == globalDictDatabase) { throw new IllegalArgumentException("Mr-Hive Global dict database is null."); } String globalDictTable = cubeName + dictConfig.getMrHiveDictTableSuffix(); addStepPhase1_DoCreateMrHiveGlobalDict(jobFlow, mrHiveDictColumns, globalDictDatabase, globalDictTable); } // then count and redistribute if (cubeConfig.isHiveRedistributeEnabled()) { final KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv(); //jobFlow.addTask(createRedistributeFlatHiveTableStep(hiveInitStatements, cubeName, flatDesc, cubeInstance.getDescriptor())); if (kylinConfig.isLivyEnabled() && cubeInstance.getEngineType() == IEngineAware.ID_SPARK) { jobFlow.addTask(createRedistributeFlatHiveTableByLivyStep(hiveInitStatements, cubeName, flatDesc, cubeInstance.getDescriptor())); } else { jobFlow.addTask(createRedistributeFlatHiveTableStep(hiveInitStatements, cubeName, flatDesc, cubeInstance.getDescriptor())); } } // special for hive addStepPhase1_DoMaterializeLookupTable(jobFlow); }
Example 3
Source File: MergeDictionaryStep.java From kylin with Apache License 2.0 | 5 votes |
@Override protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException { final CubeManager mgr = CubeManager.getInstance(context.getConfig()); final CubeInstance cube = mgr.getCube(CubingExecutableUtil.getCubeName(this.getParams())); final CubeSegment newSegment = cube.getSegmentById(CubingExecutableUtil.getSegmentId(this.getParams())); final List<CubeSegment> mergingSegments = getMergingSegments(cube); KylinConfig conf = cube.getConfig(); Collections.sort(mergingSegments); try { checkLookupSnapshotsMustIncremental(mergingSegments); // work on copy instead of cached objects CubeInstance cubeCopy = cube.latestCopyForWrite(); CubeSegment newSegCopy = cubeCopy.getSegmentById(newSegment.getUuid()); makeDictForNewSegment(conf, cubeCopy, newSegCopy, mergingSegments); makeSnapshotForNewSegment(cubeCopy, newSegCopy, mergingSegments); CubeUpdate update = new CubeUpdate(cubeCopy); update.setToUpdateSegs(newSegCopy); mgr.updateCube(update); return ExecuteResult.createSucceed(); } catch (IOException e) { logger.error("fail to merge dictionary or lookup snapshots", e); return ExecuteResult.createError(e); } }
Example 4
Source File: JobStepFactory.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
public static NSparkExecutable addStep(DefaultChainedExecutable parent, JobStepType type, CubeInstance cube) { NSparkExecutable step; KylinConfig config = cube.getConfig(); switch (type) { case RESOURCE_DETECT: step = new NResourceDetectStep(parent); break; case CUBING: step = new NSparkCubingStep(config.getSparkBuildClassName()); break; case MERGING: step = new NSparkMergingStep(config.getSparkMergeClassName()); break; case CLEAN_UP_AFTER_MERGE: step = new NSparkUpdateMetaAndCleanupAfterMergeStep(); break; default: throw new IllegalArgumentException(); } step.setParams(parent.getParams()); step.setProject(parent.getProject()); step.setTargetSubject(parent.getTargetSubject()); if (step instanceof NSparkUpdateMetaAndCleanupAfterMergeStep) { CubeSegment mergeSegment = cube.getSegmentById(parent.getTargetSegments().iterator().next()); final Segments<CubeSegment> mergingSegments = cube.getMergingSegments(mergeSegment); step.setParam(MetadataConstants.P_SEGMENT_NAMES, String.join(",", NSparkCubingUtil.toSegmentNames(mergingSegments))); step.setParam(CubingExecutableUtil.SEGMENT_ID, parent.getParam(CubingExecutableUtil.SEGMENT_ID)); step.setParam(MetadataConstants.P_JOB_TYPE, parent.getParam(MetadataConstants.P_JOB_TYPE)); step.setParam(MetadataConstants.P_OUTPUT_META_URL, parent.getParam(MetadataConstants.P_OUTPUT_META_URL)); } parent.addTask(step); //after addTask, step's id is changed step.setDistMetaUrl(config.getJobTmpMetaStoreUrl(parent.getProject(), step.getId())); return step; }
Example 5
Source File: LookupSnapshotJobBuilder.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
/**
 * Stores the cube, lookup table name, segment list and submitter, and caches the cube's
 * own {@code KylinConfig}.
 *
 * @param cube        the cube; its config is read via {@code cube.getConfig()}
 * @param lookupTable name of the lookup table
 * @param segments    segment identifiers
 * @param submitter   who submitted the job
 */
public LookupSnapshotJobBuilder(CubeInstance cube, String lookupTable, List<String> segments,
        String submitter) {
    this.kylinConfig = cube.getConfig();
    this.cube = cube;
    this.submitter = submitter;
    this.lookupTable = lookupTable;
    this.segments = segments;
}
Example 6
Source File: CuboidRecommenderUtil.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
/** Trigger cube planner phase two for optimization */ public static Map<Long, Long> getRecommendCuboidList(CubeInstance cube, Map<Long, Long> hitFrequencyMap, Map<Long, Map<Long, Pair<Long, Long>>> rollingUpCountSourceMap) throws IOException { CuboidScheduler cuboidScheduler = cube.getCuboidScheduler(); Set<Long> currentCuboids = cuboidScheduler.getAllCuboidIds(); Pair<Map<Long, Long>, Map<Long, Double>> statsPair = CuboidStatsReaderUtil .readCuboidStatsAndSizeFromCube(currentCuboids, cube); long baseCuboid = cuboidScheduler.getBaseCuboidId(); if (statsPair.getFirst().get(baseCuboid) == null || statsPair.getFirst().get(baseCuboid) == 0L) { logger.info(BASE_CUBOID_COUNT_IN_CUBOID_STATISTICS_IS_ZERO); return null; } KylinConfig config = cube.getConfig(); String key = cube.getName(); double queryUncertaintyRatio = config.getCubePlannerQueryUncertaintyRatio(); double bpusMinBenefitRatio = config.getCubePlannerBPUSMinBenefitRatio(); CuboidStats cuboidStats = new CuboidStats.Builder(key, baseCuboid, statsPair.getFirst(), statsPair.getSecond()) { @Override public Map<Long, Double> estimateCuboidsSize(Map<Long, Long> statistics) { try { return CuboidStatsReaderUtil.readCuboidSizeFromCube(statistics, cube); } catch (IOException e) { logger.warn("Fail to get cuboid size from cube due to ", e); return null; } } }.setQueryUncertaintyRatio(queryUncertaintyRatio) // .setBPUSMinBenefitRatio(bpusMinBenefitRatio) // .setHitFrequencyMap(hitFrequencyMap) // .setRollingUpCountSourceMap(rollingUpCountSourceMap) // .build(); return CuboidRecommender.getInstance().getRecommendCuboidList(cuboidStats, config); }
Example 7
Source File: CuboidRecommenderUtil.java From kylin with Apache License 2.0 | 5 votes |
/** Trigger cube planner phase two for optimization */ public static Map<Long, Long> getRecommendCuboidList(CubeInstance cube, Map<Long, Long> hitFrequencyMap, Map<Long, Map<Long, Pair<Long, Long>>> rollingUpCountSourceMap) throws IOException { CuboidScheduler cuboidScheduler = cube.getCuboidScheduler(); Set<Long> currentCuboids = cuboidScheduler.getAllCuboidIds(); Pair<Map<Long, Long>, Map<Long, Double>> statsPair = CuboidStatsReaderUtil .readCuboidStatsAndSizeFromCube(currentCuboids, cube); long baseCuboid = cuboidScheduler.getBaseCuboidId(); if (statsPair.getFirst().get(baseCuboid) == null || statsPair.getFirst().get(baseCuboid) == 0L) { logger.info(BASE_CUBOID_COUNT_IN_CUBOID_STATISTICS_IS_ZERO); return null; } KylinConfig config = cube.getConfig(); String key = cube.getName(); double queryUncertaintyRatio = config.getCubePlannerQueryUncertaintyRatio(); double bpusMinBenefitRatio = config.getCubePlannerBPUSMinBenefitRatio(); CuboidStats cuboidStats = new CuboidStats.Builder(key, baseCuboid, statsPair.getFirst(), statsPair.getSecond()) { @Override public Map<Long, Double> estimateCuboidsSize(Map<Long, Long> statistics) { try { return CuboidStatsReaderUtil.readCuboidSizeFromCube(statistics, cube); } catch (IOException e) { logger.warn("Fail to get cuboid size from cube due to ", e); return null; } } }.setQueryUncertaintyRatio(queryUncertaintyRatio) // .setBPUSMinBenefitRatio(bpusMinBenefitRatio) // .setHitFrequencyMap(hitFrequencyMap) // .setRollingUpCountSourceMap(rollingUpCountSourceMap) // .build(); return CuboidRecommender.getInstance().getRecommendCuboidList(cuboidStats, config); }
Example 8
Source File: FactDistinctColumnsReducer.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override protected void doSetup(Context context) throws IOException { super.bindCurrentConfiguration(context.getConfiguration()); Configuration conf = context.getConfiguration(); mos = new MultipleOutputs(context); KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata(); String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME); CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName); cubeConfig = cube.getConfig(); cubeDesc = cube.getDescriptor(); taskId = context.getTaskAttemptID().getTaskID().getId(); reducerMapping = new FactDistinctColumnsReducerMapping(cube); logger.info("reducer no " + taskId + ", role play " + reducerMapping.getRolePlayOfReducer(taskId)); if (reducerMapping.isCuboidRowCounterReducer(taskId)) { // hll isStatistics = true; baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId(); baseCuboidRowCountInMappers = Lists.newArrayList(); cuboidHLLMap = Maps.newHashMap(); samplingPercentage = Integer .parseInt(context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT)); logger.info("Reducer " + taskId + " handling stats"); } else { // normal col col = reducerMapping.getColForReducer(taskId); Preconditions.checkNotNull(col); // local build dict buildDictInReducer = config.isBuildDictInReducerEnabled(); if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder buildDictInReducer = false; } if (reducerMapping.getReducerNumForDimCol(col) > 1) { buildDictInReducer = false; // only works if this is the only reducer of a dictionary column } if (buildDictInReducer) { builder = DictionaryGenerator.newDictionaryBuilder(col.getType()); builder.init(null, 0, null); } logger.info("Reducer " + taskId + " handling column " + col + ", buildDictInReducer=" + buildDictInReducer); } }
Example 9
Source File: HBaseLookupMRSteps.java From kylin with Apache License 2.0 | 4 votes |
public HBaseLookupMRSteps(CubeInstance cube) { this.cube = cube; this.config = new JobEngineConfig(cube.getConfig()); }
Example 10
Source File: MergeStatisticsWithOldStep.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException { final CubeManager mgr = CubeManager.getInstance(context.getConfig()); final CubeInstance cube = mgr.getCube(CubingExecutableUtil.getCubeName(this.getParams())); final CubeSegment optimizeSegment = cube.getSegmentById(CubingExecutableUtil.getSegmentId(this.getParams())); CubeSegment oldSegment = optimizeSegment.getCubeInstance().getOriginalSegmentToOptimize(optimizeSegment); Preconditions.checkNotNull(oldSegment, "cannot find the original segment to be optimized by " + optimizeSegment); KylinConfig kylinConf = cube.getConfig(); Configuration conf = HadoopUtil.getCurrentConfiguration(); ResourceStore rs = ResourceStore.getStore(kylinConf); int averageSamplingPercentage = 0; try { //1. Add statistics from optimized segment Path statisticsDirPath = new Path(CubingExecutableUtil.getStatisticsPath(this.getParams())); FileSystem hdfs = FileSystem.get(conf); if (!hdfs.exists(statisticsDirPath)) { throw new IOException("StatisticsFilePath " + statisticsDirPath + " does not exists"); } if (!hdfs.isDirectory(statisticsDirPath)) { throw new IOException("StatisticsFilePath " + statisticsDirPath + " is not a directory"); } Path[] statisticsFiles = HadoopUtil.getFilteredPath(hdfs, statisticsDirPath, BatchConstants.CFG_OUTPUT_STATISTICS); if (statisticsFiles == null) { throw new IOException("fail to find the statistics file in base dir: " + statisticsDirPath); } for (Path item : statisticsFiles) { CubeStatsReader optimizeSegmentStatsReader = new CubeStatsReader(optimizeSegment, null, optimizeSegment.getConfig(), item); averageSamplingPercentage += optimizeSegmentStatsReader.getSamplingPercentage(); addFromCubeStatsReader(optimizeSegmentStatsReader); } //2. 
Add statistics from old segment CubeStatsReader oldSegmentStatsReader = new CubeStatsReader(oldSegment, null, oldSegment.getConfig()); averageSamplingPercentage += oldSegmentStatsReader.getSamplingPercentage(); addFromCubeStatsReader(oldSegmentStatsReader); logger.info("Cuboid set with stats info: " + cuboidHLLMap.keySet().toString()); //3. Store merged statistics for recommend cuboids averageSamplingPercentage = averageSamplingPercentage / 2; Set<Long> cuboidsRecommend = cube.getCuboidsRecommend(); Map<Long, HLLCounter> resultCuboidHLLMap = Maps.newHashMapWithExpectedSize(cuboidsRecommend.size()); for (Long cuboid : cuboidsRecommend) { HLLCounter hll = cuboidHLLMap.get(cuboid); if (hll == null) { logger.warn("Cannot get the row count stats for cuboid " + cuboid); } else { resultCuboidHLLMap.put(cuboid, hll); } } String resultDir = CubingExecutableUtil.getMergedStatisticsPath(this.getParams()); CubeStatsWriter.writeCuboidStatistics(conf, new Path(resultDir), resultCuboidHLLMap, averageSamplingPercentage, oldSegmentStatsReader.getSourceRowCount()); try (FSDataInputStream mergedStats = hdfs .open(new Path(resultDir, BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME))) { // put the statistics to metadata store String statisticsFileName = optimizeSegment.getStatisticsResourcePath(); rs.putResource(statisticsFileName, mergedStats, System.currentTimeMillis()); } //By default, the cube optimization will use in-memory cubing CubingJob cubingJob = (CubingJob) getManager() .getJob(CubingExecutableUtil.getCubingJobId(this.getParams())); StatisticsDecisionUtil.decideCubingAlgorithm(cubingJob, optimizeSegment); return new ExecuteResult(); } catch (IOException e) { logger.error("fail to merge cuboid statistics", e); return ExecuteResult.createError(e); } }
Example 11
Source File: SparkExecutableLivy.java From kylin with Apache License 2.0 | 4 votes |
@SuppressWarnings("checkstyle:methodlength") @Override protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException { ExecutableManager mgr = getManager(); Map<String, String> extra = mgr.getOutput(getId()).getExtra(); String sparkJobId = extra.get(ExecutableConstants.SPARK_JOB_ID); if (!StringUtils.isEmpty(sparkJobId)) { return onResumed(sparkJobId, mgr); } else { String cubeName = this.getParam(SparkCubingByLayer.OPTION_CUBE_NAME.getOpt()); CubeInstance cube = CubeManager.getInstance(context.getConfig()).getCube(cubeName); final KylinConfig config = cube.getConfig(); setAlgorithmLayer(); LivyRestBuilder livyRestBuilder = new LivyRestBuilder(); String segmentID = this.getParam(SparkCubingByLayer.OPTION_SEGMENT_ID.getOpt()); CubeSegment segment = cube.getSegmentById(segmentID); Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment); dumpMetadata(segment, mergingSeg); Map<String, String> sparkConfs = config.getSparkConfigOverride(); String sparkConfigName = getSparkConfigName(); if (sparkConfigName != null) { Map<String, String> sparkSpecificConfs = config.getSparkConfigOverrideWithSpecificName(sparkConfigName); sparkConfs.putAll(sparkSpecificConfs); } for (Map.Entry<String, String> entry : sparkConfs.entrySet()) { if (entry.getKey().equals("spark.submit.deployMode") || entry.getKey().equals("spark.master") || entry.getKey().equals("spark.yarn.archive")) { continue; } else { livyRestBuilder.addConf(entry.getKey(), entry.getValue()); } } formatArgs(livyRestBuilder.getArgs()); final LivyRestExecutor executor = new LivyRestExecutor(); final PatternedLogger patternedLogger = new PatternedLogger(logger, (infoKey, info) -> { // only care three properties here if (ExecutableConstants.SPARK_JOB_ID.equals(infoKey) || ExecutableConstants.YARN_APP_ID.equals(infoKey) || ExecutableConstants.YARN_APP_URL.equals(infoKey)) { getManager().addJobInfo(getId(), info); } }); try { livyRestBuilder.setLivyTypeEnum(LivyTypeEnum.job); 
executor.execute(livyRestBuilder, patternedLogger); if (isDiscarded()) { return new ExecuteResult(ExecuteResult.State.DISCARDED, "Discarded"); } if (isPaused()) { return new ExecuteResult(ExecuteResult.State.STOPPED, "Stopped"); } // done, update all properties Map<String, String> joblogInfo = patternedLogger.getInfo(); // read counter from hdfs String counterOutput = getParam(BatchConstants.ARG_COUNTER_OUTPUT); if (counterOutput != null) { if (HadoopUtil.getWorkingFileSystem().exists(new Path(counterOutput))) { Map<String, String> counterMap = HadoopUtil.readFromSequenceFile(counterOutput); joblogInfo.putAll(counterMap); } else { logger.warn("Spark counter output path not exists: " + counterOutput); } } readCounters(joblogInfo); getManager().addJobInfo(getId(), joblogInfo); return new ExecuteResult(ExecuteResult.State.SUCCEED, patternedLogger.getBufferedLog()); } catch (Exception e) { logger.error("error run spark job:", e); // clear SPARK_JOB_ID on job failure. extra = mgr.getOutput(getId()).getExtra(); extra.put(ExecutableConstants.SPARK_JOB_ID, ""); getManager().addJobInfo(getId(), extra); return new ExecuteResult(ExecuteResult.State.ERROR, e.getMessage()); } } }
Example 12
Source File: HBaseLookupMRSteps.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
public HBaseLookupMRSteps(CubeInstance cube) { this.cube = cube; this.config = new JobEngineConfig(cube.getConfig()); }
Example 13
Source File: Cuboid.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
/**
 * Clears the cache for {@code cubeInstance} in the {@code CuboidManager} obtained from the
 * cube's own config.
 *
 * @param cubeInstance the cube whose cache is cleared
 */
public static void clearCache(CubeInstance cubeInstance) {
    CuboidManager.getInstance(cubeInstance.getConfig()).clearCache(cubeInstance);
}
Example 14
Source File: SparkFactDistinct.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
private void init() throws IOException { taskId = TaskContext.getPartitionId(); kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl); try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig .setAndUnsetThreadLocalConfig(kConfig)) { CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName); cubeDesc = cubeInstance.getDescriptor(); cubeConfig = cubeInstance.getConfig(); reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance); result = Lists.newArrayList(); if (reducerMapping.isCuboidRowCounterReducer(taskId)) { // hll isStatistics = true; baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId(); baseCuboidRowCountInMappers = Lists.newArrayList(); cuboidHLLMap = Maps.newHashMap(); logger.info("Partition {} handling stats", taskId); } else { // normal col col = reducerMapping.getColForReducer(taskId); Preconditions.checkNotNull(col); isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col) && col.getType().needCompare(); isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col); // local build dict buildDictInReducer = kConfig.isBuildDictInReducerEnabled(); if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder buildDictInReducer = false; } if (reducerMapping.getReducerNumForDimCol(col) > 1) { buildDictInReducer = false; // only works if this is the only reducer of a dictionary column } if (buildDictInReducer) { builder = DictionaryGenerator.newDictionaryBuilder(col.getType()); builder.init(null, 0, null); } logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer); } initialized = true; } }
Example 15
Source File: SparkExecutableLivy.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@SuppressWarnings("checkstyle:methodlength") @Override protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException { ExecutableManager mgr = getManager(); Map<String, String> extra = mgr.getOutput(getId()).getExtra(); String sparkJobId = extra.get(ExecutableConstants.SPARK_JOB_ID); if (!StringUtils.isEmpty(sparkJobId)) { return onResumed(sparkJobId, mgr); } else { String cubeName = this.getParam(SparkCubingByLayer.OPTION_CUBE_NAME.getOpt()); CubeInstance cube = CubeManager.getInstance(context.getConfig()).getCube(cubeName); final KylinConfig config = cube.getConfig(); setAlgorithmLayer(); LivyRestBuilder livyRestBuilder = new LivyRestBuilder(); String segmentID = this.getParam(SparkCubingByLayer.OPTION_SEGMENT_ID.getOpt()); CubeSegment segment = cube.getSegmentById(segmentID); Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment); dumpMetadata(segment, mergingSeg); Map<String, String> sparkConfs = config.getSparkConfigOverride(); String sparkConfigName = getSparkConfigName(); if (sparkConfigName != null) { Map<String, String> sparkSpecificConfs = config.getSparkConfigOverrideWithSpecificName(sparkConfigName); sparkConfs.putAll(sparkSpecificConfs); } for (Map.Entry<String, String> entry : sparkConfs.entrySet()) { if (entry.getKey().equals("spark.submit.deployMode") || entry.getKey().equals("spark.master") || entry.getKey().equals("spark.yarn.archive")) { continue; } else { livyRestBuilder.addConf(entry.getKey(), entry.getValue()); } } formatArgs(livyRestBuilder.getArgs()); final LivyRestExecutor executor = new LivyRestExecutor(); final PatternedLogger patternedLogger = new PatternedLogger(logger, (infoKey, info) -> { // only care three properties here if (ExecutableConstants.SPARK_JOB_ID.equals(infoKey) || ExecutableConstants.YARN_APP_ID.equals(infoKey) || ExecutableConstants.YARN_APP_URL.equals(infoKey)) { getManager().addJobInfo(getId(), info); } }); try { livyRestBuilder.setLivyTypeEnum(LivyTypeEnum.job); 
executor.execute(livyRestBuilder, patternedLogger); if (isDiscarded()) { return new ExecuteResult(ExecuteResult.State.DISCARDED, "Discarded"); } if (isPaused()) { return new ExecuteResult(ExecuteResult.State.STOPPED, "Stopped"); } // done, update all properties Map<String, String> joblogInfo = patternedLogger.getInfo(); // read counter from hdfs String counterOutput = getParam(BatchConstants.ARG_COUNTER_OUTPUT); if (counterOutput != null) { if (HadoopUtil.getWorkingFileSystem().exists(new Path(counterOutput))) { Map<String, String> counterMap = HadoopUtil.readFromSequenceFile(counterOutput); joblogInfo.putAll(counterMap); } else { logger.warn("Spark counter output path not exists: " + counterOutput); } } readCounters(joblogInfo); getManager().addJobInfo(getId(), joblogInfo); return new ExecuteResult(ExecuteResult.State.SUCCEED, patternedLogger.getBufferedLog()); } catch (Exception e) { logger.error("error run spark job:", e); // clear SPARK_JOB_ID on job failure. extra = mgr.getOutput(getId()).getExtra(); extra.put(ExecutableConstants.SPARK_JOB_ID, ""); getManager().addJobInfo(getId(), extra); return new ExecuteResult(ExecuteResult.State.ERROR, e.getMessage()); } } }
Example 16
Source File: SparkFactDistinct.java From kylin with Apache License 2.0 | 4 votes |
private void init() throws IOException { taskId = TaskContext.getPartitionId(); kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl); try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig .setAndUnsetThreadLocalConfig(kConfig)) { CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName); cubeDesc = cubeInstance.getDescriptor(); cubeConfig = cubeInstance.getConfig(); reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance); result = Lists.newArrayList(); if (reducerMapping.isCuboidRowCounterReducer(taskId)) { // hll isStatistics = true; baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId(); baseCuboidRowCountInMappers = Lists.newArrayList(); cuboidHLLMap = Maps.newHashMap(); logger.info("Partition {} handling stats", taskId); } else { // normal col col = reducerMapping.getColForReducer(taskId); Preconditions.checkNotNull(col); isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col) && col.getType().needCompare(); isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col); // local build dict buildDictInReducer = kConfig.isBuildDictInReducerEnabled(); if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder buildDictInReducer = false; } if (reducerMapping.getReducerNumForDimCol(col) > 1) { buildDictInReducer = false; // only works if this is the only reducer of a dictionary column } if (buildDictInReducer) { builder = DictionaryGenerator.newDictionaryBuilder(col.getType()); builder.init(null, 0, null); } logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer); } initialized = true; } }
Example 17
Source File: AbstractExecutable.java From kylin with Apache License 2.0 | 4 votes |
/**
 * Returns the config of the cube named by {@code getCubeName()}, looked up through the
 * {@code CubeManager} for the environment's Kylin config.
 *
 * @return the cube's {@code KylinConfig}
 */
public KylinConfig getCubeSpecificConfig() {
    final String name = getCubeName();
    return CubeManager.getInstance(KylinConfig.getInstanceFromEnv()).getCube(name).getConfig();
}
Example 18
Source File: UpdateDictionaryStep.java From kylin with Apache License 2.0 | 4 votes |
@Override protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException { final CubeManager cubeMgr = CubeManager.getInstance(context.getConfig()); final DictionaryManager dictMgrHdfs; final DictionaryManager dictMgrHbase; final CubeInstance cube = cubeMgr.getCube(CubingExecutableUtil.getCubeName(this.getParams())); final CubeSegment newSegment = cube.getSegmentById(CubingExecutableUtil.getSegmentId(this.getParams())); final List<CubeSegment> mergingSegments = getMergingSegments(cube); final String dictInfoPath = this.getParams().get(BatchConstants.ARG_DICT_PATH); final String metadataUrl = this.getParams().get(BatchConstants.ARG_META_URL); final KylinConfig kylinConfHbase = cube.getConfig(); final KylinConfig kylinConfHdfs = AbstractHadoopJob.loadKylinConfigFromHdfs(metadataUrl); Collections.sort(mergingSegments); try { Configuration conf = HadoopUtil.getCurrentConfiguration(); FileSystem fs = HadoopUtil.getWorkingFileSystem(); ResourceStore hbaseRS = ResourceStore.getStore(kylinConfHbase); ResourceStore hdfsRS = ResourceStore.getStore(kylinConfHdfs); dictMgrHdfs = DictionaryManager.getInstance(kylinConfHdfs); dictMgrHbase = DictionaryManager.getInstance(kylinConfHbase); // work on copy instead of cached objects CubeInstance cubeCopy = cube.latestCopyForWrite(); CubeSegment newSegCopy = cubeCopy.getSegmentById(newSegment.getUuid()); // update cube segment dictionary FileStatus[] fileStatuss = fs.listStatus(new Path(dictInfoPath), new PathFilter() { @Override public boolean accept(Path path) { return path.getName().startsWith("part") || path.getName().startsWith("tmp"); } }); for (FileStatus fileStatus : fileStatuss) { Path filePath = fileStatus.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, filePath, conf); Text key = (Text) ReflectionUtils.newInstance(reader.getKeyClass(), conf); Text value = (Text) ReflectionUtils.newInstance(reader.getValueClass(), conf); while (reader.next(key, value)) { String tblCol = key.toString(); 
String dictInfoResource = value.toString(); if (StringUtils.isNotEmpty(dictInfoResource)) { logger.info(dictInfoResource); // put dictionary file to metadata store DictionaryInfo dictInfoHdfs = dictMgrHdfs.getDictionaryInfo(dictInfoResource); DictionaryInfo dicInfoHbase = dictMgrHbase.trySaveNewDict(dictInfoHdfs.getDictionaryObject(), dictInfoHdfs); if (dicInfoHbase != null){ TblColRef tblColRef = cube.getDescriptor().findColumnRef(tblCol.split(":")[0], tblCol.split(":")[1]); newSegCopy.putDictResPath(tblColRef, dicInfoHbase.getResourcePath()); } } } IOUtils.closeStream(reader); } CubeSegment lastSeg = mergingSegments.get(mergingSegments.size() - 1); for (Map.Entry<String, String> entry : lastSeg.getSnapshots().entrySet()) { newSegCopy.putSnapshotResPath(entry.getKey(), entry.getValue()); } // update statistics // put the statistics to metadata store String statisticsFileName = newSegment.getStatisticsResourcePath(); hbaseRS.putResource(statisticsFileName, hdfsRS.getResource(newSegment.getStatisticsResourcePath()).content(), System.currentTimeMillis()); CubeUpdate update = new CubeUpdate(cubeCopy); update.setToUpdateSegs(newSegCopy); cubeMgr.updateCube(update); return ExecuteResult.createSucceed(); } catch (IOException e) { logger.error("fail to merge dictionary", e); return ExecuteResult.createError(e); } }
Example 19
Source File: MergeStatisticsWithOldStep.java From kylin with Apache License 2.0 | 4 votes |
@Override protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException { final CubeManager mgr = CubeManager.getInstance(context.getConfig()); final CubeInstance cube = mgr.getCube(CubingExecutableUtil.getCubeName(this.getParams())); final CubeSegment optimizeSegment = cube.getSegmentById(CubingExecutableUtil.getSegmentId(this.getParams())); CubeSegment oldSegment = optimizeSegment.getCubeInstance().getOriginalSegmentToOptimize(optimizeSegment); Preconditions.checkNotNull(oldSegment, "cannot find the original segment to be optimized by " + optimizeSegment); KylinConfig kylinConf = cube.getConfig(); Configuration conf = HadoopUtil.getCurrentConfiguration(); ResourceStore rs = ResourceStore.getStore(kylinConf); int averageSamplingPercentage = 0; try { //1. Add statistics from optimized segment Path statisticsDirPath = new Path(CubingExecutableUtil.getStatisticsPath(this.getParams())); FileSystem hdfs = FileSystem.get(conf); if (!hdfs.exists(statisticsDirPath)) { throw new IOException("StatisticsFilePath " + statisticsDirPath + " does not exists"); } if (!hdfs.isDirectory(statisticsDirPath)) { throw new IOException("StatisticsFilePath " + statisticsDirPath + " is not a directory"); } Path[] statisticsFiles = HadoopUtil.getFilteredPath(hdfs, statisticsDirPath, BatchConstants.CFG_OUTPUT_STATISTICS); if (statisticsFiles == null) { throw new IOException("fail to find the statistics file in base dir: " + statisticsDirPath); } for (Path item : statisticsFiles) { CubeStatsReader optimizeSegmentStatsReader = new CubeStatsReader(optimizeSegment, null, optimizeSegment.getConfig(), item); averageSamplingPercentage += optimizeSegmentStatsReader.getSamplingPercentage(); addFromCubeStatsReader(optimizeSegmentStatsReader); } //2. 
Add statistics from old segment CubeStatsReader oldSegmentStatsReader = new CubeStatsReader(oldSegment, null, oldSegment.getConfig()); averageSamplingPercentage += oldSegmentStatsReader.getSamplingPercentage(); addFromCubeStatsReader(oldSegmentStatsReader); logger.info("Cuboid set with stats info: " + cuboidHLLMap.keySet().toString()); //3. Store merged statistics for recommend cuboids averageSamplingPercentage = averageSamplingPercentage / 2; Set<Long> cuboidsRecommend = cube.getCuboidsRecommend(); Map<Long, HLLCounter> resultCuboidHLLMap = Maps.newHashMapWithExpectedSize(cuboidsRecommend.size()); for (Long cuboid : cuboidsRecommend) { HLLCounter hll = cuboidHLLMap.get(cuboid); if (hll == null) { logger.warn("Cannot get the row count stats for cuboid " + cuboid); } else { resultCuboidHLLMap.put(cuboid, hll); } } String resultDir = CubingExecutableUtil.getMergedStatisticsPath(this.getParams()); CubeStatsWriter.writeCuboidStatistics(conf, new Path(resultDir), resultCuboidHLLMap, averageSamplingPercentage, oldSegmentStatsReader.getSourceRowCount()); try (FSDataInputStream mergedStats = hdfs .open(new Path(resultDir, BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME))) { // put the statistics to metadata store String statisticsFileName = optimizeSegment.getStatisticsResourcePath(); rs.putResource(statisticsFileName, mergedStats, System.currentTimeMillis()); } //By default, the cube optimization will use in-memory cubing CubingJob cubingJob = (CubingJob) getManager() .getJob(CubingExecutableUtil.getCubingJobId(this.getParams())); StatisticsDecisionUtil.decideCubingAlgorithm(cubingJob, optimizeSegment); return new ExecuteResult(); } catch (IOException e) { logger.error("fail to merge cuboid statistics", e); return ExecuteResult.createError(e); } }
Example 20
Source File: FactDistinctColumnsReducer.java From kylin with Apache License 2.0 | 4 votes |
@Override protected void doSetup(Context context) throws IOException { super.bindCurrentConfiguration(context.getConfiguration()); Configuration conf = context.getConfiguration(); mos = new MultipleOutputs(context); KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata(); String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME); CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName); cubeConfig = cube.getConfig(); cubeDesc = cube.getDescriptor(); taskId = context.getTaskAttemptID().getTaskID().getId(); reducerMapping = new FactDistinctColumnsReducerMapping(cube); logger.info("reducer no " + taskId + ", role play " + reducerMapping.getRolePlayOfReducer(taskId)); if (reducerMapping.isCuboidRowCounterReducer(taskId)) { // hll isStatistics = true; baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId(); baseCuboidRowCountInMappers = Lists.newArrayList(); cuboidHLLMap = Maps.newHashMap(); samplingPercentage = Integer .parseInt(context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT)); logger.info("Reducer " + taskId + " handling stats"); } else { // normal col col = reducerMapping.getColForReducer(taskId); Preconditions.checkNotNull(col); // local build dict buildDictInReducer = config.isBuildDictInReducerEnabled(); if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder buildDictInReducer = false; } if (reducerMapping.getReducerNumForDimCol(col) > 1) { buildDictInReducer = false; // only works if this is the only reducer of a dictionary column } if (buildDictInReducer) { builder = DictionaryGenerator.newDictionaryBuilder(col.getType()); builder.init(null, 0, null); } logger.info("Reducer " + taskId + " handling column " + col + ", buildDictInReducer=" + buildDictInReducer); } }