Java Code Examples for org.apache.kylin.cube.CubeSegment#getCubeDesc()
The following examples show how to use org.apache.kylin.cube.CubeSegment#getCubeDesc().
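Before the project examples, here is a minimal usage sketch. It assumes a cube named sample_cube exists in the Kylin metadata store; that name and the class name GetCubeDescSketch are placeholders, not part of any project below. getCubeDesc() returns the CubeDesc that defines the cube's dimensions, measures, and row key layout; every segment of a cube shares the same descriptor.

import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.cube.model.CubeDesc;

public class GetCubeDescSketch {
    public static void main(String[] args) {
        // Look up a cube by name; "sample_cube" is a hypothetical cube.
        KylinConfig config = KylinConfig.getInstanceFromEnv();
        CubeInstance cube = CubeManager.getInstance(config).getCube("sample_cube");

        // Every segment of the cube returns the same CubeDesc.
        for (CubeSegment segment : cube.getSegments()) {
            CubeDesc cubeDesc = segment.getCubeDesc();
            System.out.println(segment.getName() + " -> " + cubeDesc.getName());
        }
    }
}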
Example 1
Source File: CubeStatsReader.java From kylin-on-parquet-v2 with Apache License 2.0
private static Map<Long, Double> getCuboidSizeMapFromRowCount(CubeSegment cubeSegment, Map<Long, Long> rowCountMap,
        long sourceRowCount, boolean origin) {
    final CubeDesc cubeDesc = cubeSegment.getCubeDesc();
    final List<Integer> rowkeyColumnSize = Lists.newArrayList();
    final Cuboid baseCuboid = Cuboid.getBaseCuboid(cubeDesc);
    final List<TblColRef> columnList = baseCuboid.getColumns();
    final CubeDimEncMap dimEncMap = cubeSegment.getDimensionEncodingMap();
    final Long baseCuboidRowCount = rowCountMap.get(baseCuboid.getId());

    for (int i = 0; i < columnList.size(); i++) {
        rowkeyColumnSize.add(dimEncMap.get(columnList.get(i)).getLengthOfEncoding());
    }

    Map<Long, Double> sizeMap = Maps.newHashMap();
    for (Map.Entry<Long, Long> entry : rowCountMap.entrySet()) {
        sizeMap.put(entry.getKey(), estimateCuboidStorageSize(cubeSegment, entry.getKey(), entry.getValue(),
                baseCuboid.getId(), baseCuboidRowCount, rowkeyColumnSize, sourceRowCount));
    }

    if (!origin && cubeSegment.getConfig().enableJobCuboidSizeOptimize()) {
        optimizeSizeMap(sizeMap, cubeSegment);
    }

    return sizeMap;
}
Example 2
Source File: CubeStatsReader.java From kylin with Apache License 2.0
private static Map<Long, Double> getCuboidSizeMapFromRowCount(CubeSegment cubeSegment, Map<Long, Long> rowCountMap,
        long sourceRowCount, boolean origin) {
    final CubeDesc cubeDesc = cubeSegment.getCubeDesc();
    final List<Integer> rowkeyColumnSize = Lists.newArrayList();
    final Cuboid baseCuboid = Cuboid.getBaseCuboid(cubeDesc);
    final List<TblColRef> columnList = baseCuboid.getColumns();
    final CubeDimEncMap dimEncMap = cubeSegment.getDimensionEncodingMap();
    final Long baseCuboidRowCount = rowCountMap.get(baseCuboid.getId());

    for (int i = 0; i < columnList.size(); i++) {
        rowkeyColumnSize.add(dimEncMap.get(columnList.get(i)).getLengthOfEncoding());
    }

    Map<Long, Double> sizeMap = Maps.newHashMap();
    for (Map.Entry<Long, Long> entry : rowCountMap.entrySet()) {
        sizeMap.put(entry.getKey(), estimateCuboidStorageSize(cubeSegment, entry.getKey(), entry.getValue(),
                baseCuboid.getId(), baseCuboidRowCount, rowkeyColumnSize, sourceRowCount));
    }

    if (!origin && cubeSegment.getConfig().enableJobCuboidSizeOptimize()) {
        optimizeSizeMap(sizeMap, cubeSegment);
    }

    return sizeMap;
}
Example 3
Source File: CubingJobBuilder.java From Kylin with Apache License 2.0
Pair<AbstractExecutable, AbstractExecutable> addCubingSteps(CubeSegment seg, String cuboidRootPath, CubingJob result) {
    final int groupRowkeyColumnsCount = seg.getCubeDesc().getRowkey().getNCuboidBuildLevels();
    final int totalRowkeyColumnsCount = seg.getCubeDesc().getRowkey().getRowKeyColumns().length;

    final String jobId = result.getId();
    final CubeJoinedFlatTableDesc intermediateTableDesc = new CubeJoinedFlatTableDesc(seg.getCubeDesc(), seg);
    final String intermediateHiveTableName = getIntermediateHiveTableName(intermediateTableDesc, jobId);
    final String intermediateHiveTableLocation = getIntermediateHiveTableLocation(intermediateTableDesc, jobId);
    final String factDistinctColumnsPath = getFactDistinctColumnsPath(seg, jobId);
    final String[] cuboidOutputTempPath = getCuboidOutputPaths(cuboidRootPath, totalRowkeyColumnsCount,
            groupRowkeyColumnsCount);

    final AbstractExecutable intermediateHiveTableStep = createIntermediateHiveTableStep(intermediateTableDesc, jobId);
    result.addTask(intermediateHiveTableStep);
    result.addTask(createFactDistinctColumnsStep(seg, intermediateHiveTableName, jobId));
    result.addTask(createBuildDictionaryStep(seg, factDistinctColumnsPath));

    // base cuboid step
    final MapReduceExecutable baseCuboidStep = createBaseCuboidStep(seg, intermediateHiveTableLocation,
            cuboidOutputTempPath);
    result.addTask(baseCuboidStep);

    // n dim cuboid steps
    for (int i = 1; i <= groupRowkeyColumnsCount; i++) {
        int dimNum = totalRowkeyColumnsCount - i;
        result.addTask(createNDimensionCuboidStep(seg, cuboidOutputTempPath, dimNum, totalRowkeyColumnsCount));
    }

    return new Pair<AbstractExecutable, AbstractExecutable>(intermediateHiveTableStep, baseCuboidStep);
}
Example 4
Source File: RowKeySplitter.java From kylin with Apache License 2.0
public RowKeySplitter(CubeSegment cubeSeg, int splitLen, int bytesLen) {
    this.enableSharding = cubeSeg.isEnableSharding();
    this.cubeDesc = cubeSeg.getCubeDesc();
    IDimensionEncodingMap dimEncoding = new CubeDimEncMap(cubeSeg);

    for (RowKeyColDesc rowKeyColDesc : cubeDesc.getRowkey().getRowKeyColumns()) {
        dimEncoding.get(rowKeyColDesc.getColRef());
    }

    this.colIO = new RowKeyColumnIO(dimEncoding);

    this.splitBuffers = new ByteArray[splitLen];
    this.splitOffsets = new int[splitLen];
    this.bufferSize = 0;
}
Example 5
Source File: RowKeySplitter.java From kylin-on-parquet-v2 with Apache License 2.0
public RowKeySplitter(CubeSegment cubeSeg, int splitLen, int bytesLen) {
    this.enableSharding = cubeSeg.isEnableSharding();
    this.cubeDesc = cubeSeg.getCubeDesc();
    IDimensionEncodingMap dimEncoding = new CubeDimEncMap(cubeSeg);

    for (RowKeyColDesc rowKeyColDesc : cubeDesc.getRowkey().getRowKeyColumns()) {
        dimEncoding.get(rowKeyColDesc.getColRef());
    }

    this.colIO = new RowKeyColumnIO(dimEncoding);

    this.splitBuffers = new ByteArray[splitLen];
    this.splitOffsets = new int[splitLen];
    this.bufferSize = 0;
}
Example 6
Source File: KafkaInputBase.java From kylin-on-parquet-v2 with Apache License 2.0
public BaseBatchCubingInputSide(CubeSegment seg, IJoinedFlatTableDesc flatDesc) {
    this.conf = new JobEngineConfig(KylinConfig.getInstanceFromEnv());
    this.config = seg.getConfig();
    this.flatDesc = flatDesc;
    this.hiveTableDatabase = config.getHiveDatabaseForIntermediateTable();
    this.seg = seg;
    this.cubeDesc = seg.getCubeDesc();
    this.cubeName = seg.getCubeInstance().getName();
}
Example 7
Source File: CuboidSchedulerUtil.java From kylin with Apache License 2.0
public static CuboidScheduler getCuboidScheduler(CubeSegment segment, Set<Long> cuboidSet) {
    try {
        Map<Long, Long> cuboidsWithRowCnt = CuboidStatsReaderUtil.readCuboidStatsFromSegment(cuboidSet, segment);
        Comparator<Long> comparator = cuboidsWithRowCnt == null ? Cuboid.cuboidSelectComparator
                : new TreeCuboidScheduler.CuboidCostComparator(cuboidsWithRowCnt);
        return new TreeCuboidScheduler(segment.getCubeDesc(), Lists.newArrayList(cuboidSet), comparator);
    } catch (IOException e) {
        throw new RuntimeException("Failed to read cube stats for segment " + segment + " due to " + e);
    }
}
Example 8
Source File: RowKeyDecoder.java From Kylin with Apache License 2.0
public RowKeyDecoder(CubeSegment cubeSegment) {
    this.cubeDesc = cubeSegment.getCubeDesc();
    this.rowKeySplitter = new RowKeySplitter(cubeSegment, 65, 255);
    this.colIO = new RowKeyColumnIO(cubeSegment);
    this.values = new ArrayList<String>();
}
Example 9
Source File: CubeDimEncMap.java From kylin with Apache License 2.0
public CubeDimEncMap(CubeSegment seg) {
    this.cubeDesc = seg.getCubeDesc();
    this.seg = seg;
    this.dictionaryMap = null;
}
Example 10
Source File: RowKeyDecoder.java From kylin with Apache License 2.0
public RowKeyDecoder(CubeSegment cubeSegment) {
    this.cubeDesc = cubeSegment.getCubeDesc();
    this.rowKeySplitter = new RowKeySplitter(cubeSegment);
    this.colIO = new RowKeyColumnIO(cubeSegment.getDimensionEncodingMap());
    this.values = new ArrayList<String>();
}
Example 11
Source File: CubeJoinedFlatTableDesc.java From kylin with Apache License 2.0
public CubeJoinedFlatTableDesc(CubeSegment cubeSegment, boolean includingDerived) {
    this(cubeSegment.getCubeDesc(), cubeSegment, includingDerived);
}
Example 12
Source File: CubeJoinedFlatTableDesc.java From kylin with Apache License 2.0
public CubeJoinedFlatTableDesc(CubeSegment cubeSegment) {
    this(cubeSegment.getCubeDesc(), cubeSegment, false);
}
Example 13
Source File: CreateHTableJob.java From kylin with Apache License 2.0
public static byte[][] getRegionSplitsFromCuboidStatistics(final Map<Long, Double> cubeSizeMap,
        final KylinConfig kylinConfig, final CubeSegment cubeSegment, final Path hfileSplitsOutputFolder)
        throws IOException {

    final CubeDesc cubeDesc = cubeSegment.getCubeDesc();
    float cut = cubeDesc.getConfig().getKylinHBaseRegionCut();
    logger.info("Cut for HBase region is {} GB", cut);

    double totalSizeInM = 0;
    for (Double cuboidSize : cubeSizeMap.values()) {
        totalSizeInM += cuboidSize;
    }

    List<Long> allCuboids = Lists.newArrayList();
    allCuboids.addAll(cubeSizeMap.keySet());
    Collections.sort(allCuboids);

    int nRegion = Math.round((float) (totalSizeInM / (cut * 1024L)));
    nRegion = Math.max(kylinConfig.getHBaseRegionCountMin(), nRegion);
    nRegion = Math.min(kylinConfig.getHBaseRegionCountMax(), nRegion);

    if (cubeSegment.isEnableSharding()) {
        // use prime nRegions to help random sharding
        int original = nRegion;
        if (nRegion == 0) {
            nRegion = 1;
        }
        if (nRegion > Short.MAX_VALUE) {
            logger.info("Too many regions! reduce to {}", Short.MAX_VALUE);
            nRegion = Short.MAX_VALUE;
        }
        if (nRegion != original) {
            logger.info("Region count is adjusted from {} to {} to help random sharding", original, nRegion);
        }
    }

    int mbPerRegion = (int) (totalSizeInM / nRegion);
    mbPerRegion = Math.max(1, mbPerRegion);

    logger.info("Total size {} M (estimated)", totalSizeInM);
    logger.info("Expecting {} regions.", nRegion);
    logger.info("Expecting {} MB per region.", mbPerRegion);

    if (cubeSegment.isEnableSharding()) {
        // each cuboid will be split into different number of shards
        HashMap<Long, Short> cuboidShards = Maps.newHashMap();

        // each shard/region may be split into multiple hfiles;
        // array index: region ID; map key: cuboidID, map value: cuboid size in the region
        List<HashMap<Long, Double>> innerRegionSplits = Lists.newArrayList();
        for (int i = 0; i < nRegion; i++) {
            innerRegionSplits.add(new HashMap<Long, Double>());
        }

        double[] regionSizes = new double[nRegion];
        for (long cuboidId : allCuboids) {
            double estimatedSize = cubeSizeMap.get(cuboidId);
            double magic = 23;
            int shardNum = (int) (estimatedSize * magic / mbPerRegion + 1);
            if (shardNum < 1) {
                shardNum = 1;
            }

            if (shardNum > nRegion) {
                logger.debug(String.format(Locale.ROOT,
                        "Cuboid %d 's estimated size %.2f MB will generate %d regions, " + "reduce to %d", cuboidId,
                        estimatedSize, shardNum, nRegion));
                shardNum = nRegion;
            } else {
                logger.debug(String.format(Locale.ROOT,
                        "Cuboid %d 's estimated size %.2f MB will generate %d regions", cuboidId, estimatedSize,
                        shardNum));
            }

            cuboidShards.put(cuboidId, (short) shardNum);
            short startShard = ShardingHash.getShard(cuboidId, nRegion);
            for (short i = startShard; i < startShard + shardNum; ++i) {
                short j = (short) (i % nRegion);
                regionSizes[j] = regionSizes[j] + estimatedSize / shardNum;
                innerRegionSplits.get(j).put(cuboidId, estimatedSize / shardNum);
            }
        }

        for (int i = 0; i < nRegion; ++i) {
            logger.debug("Region {}'s estimated size is {} MB, accounting for {} percent", i, regionSizes[i],
                    100.0 * regionSizes[i] / totalSizeInM);
        }

        CuboidShardUtil.saveCuboidShards(cubeSegment, cuboidShards, nRegion);
        saveHFileSplits(innerRegionSplits, mbPerRegion, hfileSplitsOutputFolder, kylinConfig);
        return getSplitsByRegionCount(nRegion);
    } else {
        throw new IllegalStateException("Not supported");
    }
}
Example 14
Source File: MapReduceUtil.java From kylin with Apache License 2.0
/**
 * @param cuboidScheduler a caller-specified scheduler allows more flexibility
 */
public static int getLayeredCubingReduceTaskNum(CubeSegment cubeSegment, CuboidScheduler cuboidScheduler,
        double totalMapInputMB, int level)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    CubeDesc cubeDesc = cubeSegment.getCubeDesc();
    KylinConfig kylinConfig = cubeDesc.getConfig();

    double perReduceInputMB = kylinConfig.getDefaultHadoopJobReducerInputMB();
    double reduceCountRatio = kylinConfig.getDefaultHadoopJobReducerCountRatio();
    logger.info("Having per reduce MB " + perReduceInputMB + ", reduce count ratio " + reduceCountRatio + ", level "
            + level);

    CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, cuboidScheduler, kylinConfig);

    double parentLayerSizeEst, currentLayerSizeEst, adjustedCurrentLayerSizeEst;

    if (level == -1) {
        // merge case
        double estimatedSize = cubeStatsReader.estimateCubeSize();
        adjustedCurrentLayerSizeEst = estimatedSize > totalMapInputMB ? totalMapInputMB : estimatedSize;
        logger.debug("estimated size {}, input size {}, adjustedCurrentLayerSizeEst: {}", estimatedSize,
                totalMapInputMB, adjustedCurrentLayerSizeEst);
    } else if (level == 0) {
        // base cuboid case TODO: the estimation could be very WRONG because it has no correction
        adjustedCurrentLayerSizeEst = cubeStatsReader.estimateLayerSize(0);
        logger.debug("adjustedCurrentLayerSizeEst: {}", adjustedCurrentLayerSizeEst);
    } else {
        parentLayerSizeEst = cubeStatsReader.estimateLayerSize(level - 1);
        currentLayerSizeEst = cubeStatsReader.estimateLayerSize(level);
        adjustedCurrentLayerSizeEst = totalMapInputMB / parentLayerSizeEst * currentLayerSizeEst;
        logger.debug(
                "totalMapInputMB: {}, parentLayerSizeEst: {}, currentLayerSizeEst: {}, adjustedCurrentLayerSizeEst: {}",
                totalMapInputMB, parentLayerSizeEst, currentLayerSizeEst, adjustedCurrentLayerSizeEst);
    }

    // number of reduce tasks
    int numReduceTasks = (int) Math.round(adjustedCurrentLayerSizeEst / perReduceInputMB * reduceCountRatio + 0.99);

    // adjust reducer number for cube which has DISTINCT_COUNT measures for better performance
    if (cubeDesc.hasMemoryHungryMeasures()) {
        logger.debug("Multiply reducer num by 4 to boost performance for memory hungry measures");
        numReduceTasks = numReduceTasks * 4;
    }

    // at least 1 reducer by default
    numReduceTasks = Math.max(kylinConfig.getHadoopJobMinReducerNumber(), numReduceTasks);
    // no more than 500 reducer by default
    numReduceTasks = Math.min(kylinConfig.getHadoopJobMaxReducerNumber(), numReduceTasks);

    return numReduceTasks;
}
Example 15
Source File: CubeDimEncMap.java From kylin-on-parquet-v2 with Apache License 2.0
public CubeDimEncMap(CubeSegment seg) {
    this.cubeDesc = seg.getCubeDesc();
    this.seg = seg;
    this.dictionaryMap = null;
}
Example 16
Source File: RowKeyDecoder.java From kylin-on-parquet-v2 with Apache License 2.0
public RowKeyDecoder(CubeSegment cubeSegment) {
    this.cubeDesc = cubeSegment.getCubeDesc();
    this.rowKeySplitter = new RowKeySplitter(cubeSegment);
    this.colIO = new RowKeyColumnIO(cubeSegment.getDimensionEncodingMap());
    this.values = new ArrayList<String>();
}
Example 17
Source File: CubeJoinedFlatTableDesc.java From kylin-on-parquet-v2 with Apache License 2.0
public CubeJoinedFlatTableDesc(CubeSegment cubeSegment, boolean includingDerived) {
    this(cubeSegment.getCubeDesc(), cubeSegment, includingDerived);
}
Example 18
Source File: CubeJoinedFlatTableDesc.java From kylin-on-parquet-v2 with Apache License 2.0
public CubeJoinedFlatTableDesc(CubeSegment cubeSegment) {
    this(cubeSegment.getCubeDesc(), cubeSegment, false);
}
Example 19
Source File: CreateHTableJob.java From kylin-on-parquet-v2 with Apache License 2.0
public static byte[][] getRegionSplitsFromCuboidStatistics(final Map<Long, Double> cubeSizeMap,
        final KylinConfig kylinConfig, final CubeSegment cubeSegment, final Path hfileSplitsOutputFolder)
        throws IOException {

    final CubeDesc cubeDesc = cubeSegment.getCubeDesc();
    float cut = cubeDesc.getConfig().getKylinHBaseRegionCut();
    logger.info("Cut for HBase region is {} GB", cut);

    double totalSizeInM = 0;
    for (Double cuboidSize : cubeSizeMap.values()) {
        totalSizeInM += cuboidSize;
    }

    List<Long> allCuboids = Lists.newArrayList();
    allCuboids.addAll(cubeSizeMap.keySet());
    Collections.sort(allCuboids);

    int nRegion = Math.round((float) (totalSizeInM / (cut * 1024L)));
    nRegion = Math.max(kylinConfig.getHBaseRegionCountMin(), nRegion);
    nRegion = Math.min(kylinConfig.getHBaseRegionCountMax(), nRegion);

    if (cubeSegment.isEnableSharding()) {
        // use prime nRegions to help random sharding
        int original = nRegion;
        if (nRegion == 0) {
            nRegion = 1;
        }
        if (nRegion > Short.MAX_VALUE) {
            logger.info("Too many regions! reduce to {}", Short.MAX_VALUE);
            nRegion = Short.MAX_VALUE;
        }
        if (nRegion != original) {
            logger.info("Region count is adjusted from {} to {} to help random sharding", original, nRegion);
        }
    }

    int mbPerRegion = (int) (totalSizeInM / nRegion);
    mbPerRegion = Math.max(1, mbPerRegion);

    logger.info("Total size {} M (estimated)", totalSizeInM);
    logger.info("Expecting {} regions.", nRegion);
    logger.info("Expecting {} MB per region.", mbPerRegion);

    if (cubeSegment.isEnableSharding()) {
        // each cuboid will be split into different number of shards
        HashMap<Long, Short> cuboidShards = Maps.newHashMap();

        // each shard/region may be split into multiple hfiles;
        // array index: region ID; map key: cuboidID, map value: cuboid size in the region
        List<HashMap<Long, Double>> innerRegionSplits = Lists.newArrayList();
        for (int i = 0; i < nRegion; i++) {
            innerRegionSplits.add(new HashMap<Long, Double>());
        }

        double[] regionSizes = new double[nRegion];
        for (long cuboidId : allCuboids) {
            double estimatedSize = cubeSizeMap.get(cuboidId);
            double magic = 23;
            int shardNum = (int) (estimatedSize * magic / mbPerRegion + 1);
            if (shardNum < 1) {
                shardNum = 1;
            }

            if (shardNum > nRegion) {
                logger.debug(String.format(Locale.ROOT,
                        "Cuboid %d 's estimated size %.2f MB will generate %d regions, " + "reduce to %d", cuboidId,
                        estimatedSize, shardNum, nRegion));
                shardNum = nRegion;
            } else {
                logger.debug(String.format(Locale.ROOT,
                        "Cuboid %d 's estimated size %.2f MB will generate %d regions", cuboidId, estimatedSize,
                        shardNum));
            }

            cuboidShards.put(cuboidId, (short) shardNum);
            short startShard = ShardingHash.getShard(cuboidId, nRegion);
            for (short i = startShard; i < startShard + shardNum; ++i) {
                short j = (short) (i % nRegion);
                regionSizes[j] = regionSizes[j] + estimatedSize / shardNum;
                innerRegionSplits.get(j).put(cuboidId, estimatedSize / shardNum);
            }
        }

        for (int i = 0; i < nRegion; ++i) {
            logger.debug("Region {}'s estimated size is {} MB, accounting for {} percent", i, regionSizes[i],
                    100.0 * regionSizes[i] / totalSizeInM);
        }

        CuboidShardUtil.saveCuboidShards(cubeSegment, cuboidShards, nRegion);
        saveHFileSplits(innerRegionSplits, mbPerRegion, hfileSplitsOutputFolder, kylinConfig);
        return getSplitsByRegionCount(nRegion);
    } else {
        throw new IllegalStateException("Not supported");
    }
}
Example 20
Source File: MapReduceUtil.java From kylin-on-parquet-v2 with Apache License 2.0
/**
 * @param cuboidScheduler a caller-specified scheduler allows more flexibility
 */
public static int getLayeredCubingReduceTaskNum(CubeSegment cubeSegment, CuboidScheduler cuboidScheduler,
        double totalMapInputMB, int level)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    CubeDesc cubeDesc = cubeSegment.getCubeDesc();
    KylinConfig kylinConfig = cubeDesc.getConfig();

    double perReduceInputMB = kylinConfig.getDefaultHadoopJobReducerInputMB();
    double reduceCountRatio = kylinConfig.getDefaultHadoopJobReducerCountRatio();
    logger.info("Having per reduce MB " + perReduceInputMB + ", reduce count ratio " + reduceCountRatio + ", level "
            + level);

    CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, cuboidScheduler, kylinConfig);

    double parentLayerSizeEst, currentLayerSizeEst, adjustedCurrentLayerSizeEst;

    if (level == -1) {
        // merge case
        double estimatedSize = cubeStatsReader.estimateCubeSize();
        adjustedCurrentLayerSizeEst = estimatedSize > totalMapInputMB ? totalMapInputMB : estimatedSize;
        logger.debug("estimated size {}, input size {}, adjustedCurrentLayerSizeEst: {}", estimatedSize,
                totalMapInputMB, adjustedCurrentLayerSizeEst);
    } else if (level == 0) {
        // base cuboid case TODO: the estimation could be very WRONG because it has no correction
        adjustedCurrentLayerSizeEst = cubeStatsReader.estimateLayerSize(0);
        logger.debug("adjustedCurrentLayerSizeEst: {}", adjustedCurrentLayerSizeEst);
    } else {
        parentLayerSizeEst = cubeStatsReader.estimateLayerSize(level - 1);
        currentLayerSizeEst = cubeStatsReader.estimateLayerSize(level);
        adjustedCurrentLayerSizeEst = totalMapInputMB / parentLayerSizeEst * currentLayerSizeEst;
        logger.debug(
                "totalMapInputMB: {}, parentLayerSizeEst: {}, currentLayerSizeEst: {}, adjustedCurrentLayerSizeEst: {}",
                totalMapInputMB, parentLayerSizeEst, currentLayerSizeEst, adjustedCurrentLayerSizeEst);
    }

    // number of reduce tasks
    int numReduceTasks = (int) Math.round(adjustedCurrentLayerSizeEst / perReduceInputMB * reduceCountRatio + 0.99);

    // adjust reducer number for cube which has DISTINCT_COUNT measures for better performance
    if (cubeDesc.hasMemoryHungryMeasures()) {
        logger.debug("Multiply reducer num by 4 to boost performance for memory hungry measures");
        numReduceTasks = numReduceTasks * 4;
    }

    // at least 1 reducer by default
    numReduceTasks = Math.max(kylinConfig.getHadoopJobMinReducerNumber(), numReduceTasks);
    // no more than 500 reducer by default
    numReduceTasks = Math.min(kylinConfig.getHadoopJobMaxReducerNumber(), numReduceTasks);

    return numReduceTasks;
}