Java Code Examples for org.apache.kylin.cube.CubeManager#getCube()
The following examples show how to use org.apache.kylin.cube.CubeManager#getCube(). You can vote up the examples you like or vote down the ones you don't, and you can go to the original project or source file by following the links above each example.
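Before the project examples, here is a minimal sketch of the basic call pattern, assuming a Kylin environment is already configured; the wrapper class and the cube name "sample_cube" are hypothetical and only illustrate the lookup:

import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;

public class GetCubeSketch {
    public static void main(String[] args) {
        // Load the Kylin configuration from the environment (kylin.properties).
        KylinConfig config = KylinConfig.getInstanceFromEnv();

        // CubeManager is obtained per KylinConfig; getCube() looks a cube up by name.
        CubeManager cubeManager = CubeManager.getInstance(config);
        CubeInstance cube = cubeManager.getCube("sample_cube"); // hypothetical cube name

        if (cube == null) {
            System.out.println("Cube not found");
        } else {
            // Typical follow-up calls seen in the examples below.
            System.out.println("Descriptor: " + cube.getDescriptor().getName());
            System.out.println("Segments:   " + cube.getSegments().size());
        }
    }
}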
Example 1
Source File: HybridCubeCLITest.java From kylin-on-parquet-v2 with Apache License 2.0 | 6 votes |
@Test
public void testSegmentOverlap() throws IOException {
    thrown.expect(RuntimeException.class);
    thrown.expectMessage("Segments has overlap");

    HybridManager hybridManager = HybridManager.getInstance(KylinConfig.getInstanceFromEnv());
    Assert.assertNull(hybridManager.getHybridInstance("ssb_hybrid"));
    HybridCubeCLI.main(new String[] { "-name", "ssb_hybrid", "-project", "default", "-model", "ssb", "-cubes",
            "ssb_cube1,ssb_cube2", "-action", "create" });

    CubeManager cubeManager = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
    CubeInstance cube1 = cubeManager.getCube("ssb_cube1");
    CubeInstance cube2 = cubeManager.getCube("ssb_cube2");

    // 2012-01-01,2012-01-03
    cubeManager.appendSegment(cube1, new SegmentRange.TSRange(1325376000000L, 1325548800000L));
    // 2012-01-02,2012-01-04
    cubeManager.appendSegment(cube2, new SegmentRange.TSRange(1325462400000L, 1325635200000L));

    HybridCubeCLI.main(new String[] { "-name", "ssb_hybrid", "-project", "default", "-model", "ssb", "-cubes",
            "ssb_cube1,ssb_cube2", "-action", "update" });
}
Example 2
Source File: ColumnarSplitReader.java From kylin-on-parquet-v2 with Apache License 2.0 | 6 votes |
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    if (!(split instanceof FileSplit)) {
        throw new IllegalArgumentException("Only compatible with FileSplits.");
    } else {
        logger.debug("CFG_Cube_Name: " + BatchConstants.CFG_CUBE_NAME);
        cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME).toUpperCase(Locale.ROOT);
        segmentName = context.getConfiguration().get(BatchConstants.CFG_CUBE_SEGMENT_NAME).toUpperCase(Locale.ROOT);

        KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
        CubeManager cubeManager = CubeManager.getInstance(config);
        cube = cubeManager.getCube(cubeName);
        cubeDesc = cube.getDescriptor();
        cubeSegment = cube.getSegment(segmentName, SegmentStatusEnum.NEW);
    }
}
Example 3
Source File: DeployCoprocessorCLI.java From kylin with Apache License 2.0 | 6 votes |
private static List<String> filterByProjects(List<String> allTableNames, List<String> projectNames) {
    ProjectManager projectManager = ProjectManager.getInstance(KylinConfig.getInstanceFromEnv());
    CubeManager cubeManager = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());

    List<String> result = Lists.newArrayList();
    for (String p : projectNames) {
        p = p.trim();
        if (p.endsWith(",")) {
            p = p.substring(0, p.length() - 1);
        }

        ProjectInstance projectInstance = projectManager.getProject(p);
        List<RealizationEntry> cubeList = projectInstance.getRealizationEntries(RealizationType.CUBE);
        for (RealizationEntry cube : cubeList) {
            CubeInstance cubeInstance = cubeManager.getCube(cube.getRealization());
            for (CubeSegment segment : cubeInstance.getSegments()) {
                String tableName = segment.getStorageLocationIdentifier();
                if (allTableNames.contains(tableName)) {
                    result.add(tableName);
                }
            }
        }
    }
    return result;
}
Example 4
Source File: MergeCuboidMapper.java From kylin-on-parquet-v2 with Apache License 2.0 | 6 votes |
@Override
protected void doSetup(Context context) throws IOException, InterruptedException {
    super.bindCurrentConfiguration(context.getConfiguration());

    String cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME);
    String segmentID = context.getConfiguration().get(BatchConstants.CFG_CUBE_SEGMENT_ID);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();

    CubeManager cubeManager = CubeManager.getInstance(config);
    CubeInstance cube = cubeManager.getCube(cubeName);
    CubeDesc cubeDesc = cube.getDescriptor();
    CubeSegment mergedCubeSegment = cube.getSegmentById(segmentID);

    // decide which source segment
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    IMROutput2.IMRMergeOutputFormat outputFormat = MRUtil.getBatchMergeOutputSide2(mergedCubeSegment)
            .getOutputFormat();
    CubeSegment sourceCubeSegment = outputFormat.findSourceSegment(fileSplit, cube);
    reEncoder = new SegmentReEncoder(cubeDesc, sourceCubeSegment, mergedCubeSegment, config);
}
Example 5
Source File: UpdateCubeInfoAfterCheckpointStep.java From kylin-on-parquet-v2 with Apache License 2.0 | 6 votes |
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    final CubeManager cubeManager = CubeManager.getInstance(context.getConfig());
    final CubeInstance cube = cubeManager.getCube(CubingExecutableUtil.getCubeName(this.getParams()));

    Set<Long> recommendCuboids = cube.getCuboidsRecommend();
    try {
        List<CubeSegment> newSegments = cube.getSegments(SegmentStatusEnum.READY_PENDING);
        Map<Long, Long> recommendCuboidsWithStats = CuboidStatsReaderUtil
                .readCuboidStatsFromSegments(recommendCuboids, newSegments);
        if (recommendCuboidsWithStats == null) {
            throw new RuntimeException("Fail to get statistics info for recommended cuboids after optimization!!!");
        }
        cubeManager.promoteCheckpointOptimizeSegments(cube, recommendCuboidsWithStats,
                newSegments.toArray(new CubeSegment[newSegments.size()]));
        return new ExecuteResult();
    } catch (Exception e) {
        logger.error("fail to update cube after build", e);
        return ExecuteResult.createError(e);
    }
}
Example 6
Source File: BaseCuboidMapperTest.java From Kylin with Apache License 2.0 | 5 votes |
@Test
public void testMapperWithNull() throws Exception {
    String cubeName = "test_kylin_cube_with_slr_1_new_segment";
    String segmentName = "20130331080000_20131212080000";
    mapDriver.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
    mapDriver.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_NAME, segmentName);
    // mapDriver.getConfiguration().set(BatchConstants.CFG_METADATA_URL,
    // metadata);

    mapDriver.withInput(new Text("key"),
            new Text("2012-12-15118480Health & BeautyFragrances\\NAuction15123456789\\N"));
    List<Pair<Text, Text>> result = mapDriver.run();

    CubeManager cubeMgr = CubeManager.getInstance(getTestConfig());
    CubeInstance cube = cubeMgr.getCube(cubeName);

    assertEquals(1, result.size());
    Text rowkey = result.get(0).getFirst();
    byte[] key = rowkey.getBytes();
    byte[] header = Bytes.head(key, 26);
    byte[] sellerId = Bytes.tail(header, 18);
    byte[] cuboidId = Bytes.head(header, 8);
    byte[] restKey = Bytes.tail(key, rowkey.getLength() - 26);

    RowKeyDecoder decoder = new RowKeyDecoder(cube.getFirstSegment());
    decoder.decode(key);
    assertEquals("[123456789, 2012-12-15, 11848, Health & Beauty, Fragrances, null, Auction, 0, 15]",
            decoder.getValues().toString());

    assertTrue(Bytes.toString(sellerId).startsWith("123456789"));
    assertEquals(511, Bytes.toLong(cuboidId));
    assertEquals(22, restKey.length);

    verifyMeasures(cube.getDescriptor().getMeasures(), result.get(0).getSecond(), "0", "0", "0");
}
Example 7
Source File: ProjectManagerTest.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
@Test
public void testProjectsDrop() throws IOException {
    ProjectManager prjMgr = ProjectManager.getInstance(getTestConfig());
    CubeManager cubeMgr = CubeManager.getInstance(getTestConfig());

    CubeInstance cube = cubeMgr.getCube("ci_left_join_cube");
    assertTrue(prjMgr.getRealizationsByTable("default", "default.test_kylin_fact").contains(cube));
    assertTrue(prjMgr.listAllRealizations("default").contains(cube));

    cubeMgr.dropCube(cube.getName(), false);

    assertTrue(!prjMgr.getRealizationsByTable("default", "default.test_kylin_fact").contains(cube));
    assertTrue(!prjMgr.listAllRealizations("default").contains(cube));
}
Example 8
Source File: ColumnToRowJob.java From kylin with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_NAME);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
        String segmentName = getOptionValue(OPTION_SEGMENT_NAME);

        KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
        CubeManager cubeMgr = CubeManager.getInstance(kylinConfig);
        CubeInstance cube = cubeMgr.getCube(cubeName);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        setJobClasspath(job, cube.getConfig());

        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);

        job.setMapperClass(ColumnToRowMapper.class);
        job.setInputFormatClass(ColumnarSplitDataInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ColumnToRowReducer.class);
        job.setNumReduceTasks(calReducerNum(input));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.getConfiguration().set("dfs.block.size", cube.getConfig().getStreamingBasicCuboidJobDFSBlockSize());
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_NAME, segmentName);

        CubeSegment segment = cube.getSegment(segmentName, SegmentStatusEnum.NEW);
        attachSegmentMetadataWithDict(segment, job.getConfiguration());

        this.deletePath(job.getConfiguration(), output);
        return waitForCompletion(job);
    } catch (Exception e) {
        logger.error("error in CuboidJob", e);
        printUsage(options);
        throw e;
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 9
Source File: LocalWithSparkSessionTest.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
protected void cleanupSegments(String cubeName) throws IOException {
    KylinConfig config = KylinConfig.getInstanceFromEnv();
    CubeManager cubeMgr = CubeManager.getInstance(config);
    CubeInstance cube = cubeMgr.getCube(cubeName);
    cubeMgr.updateCubeDropSegments(cube, cube.getSegments());
}
Example 10
Source File: SignatureCalculatorTest.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Test
public void testRealizationSetCalculator() throws IOException {
    KylinConfig config = KylinConfig.createKylinConfig(getTestConfig());
    Map<String, String> overrides = Maps.newHashMap();
    overrides.put("kylin.query.signature-class", "org.apache.kylin.rest.signature.RealizationSetCalculator");

    ProjectInstance projectInstance = ProjectManager.getInstance(config).getProject(projectName);
    projectInstance.setConfig(KylinConfigExt.createInstance(config, overrides));

    HybridManager hybridManager = HybridManager.getInstance(config);
    HybridInstance hybrid1 = hybridManager.getHybridInstance("test_kylin_hybrid_ready");

    CubeManager cubeManager = CubeManager.getInstance(config);
    CubeInstance cube1 = cubeManager.getCube("test_kylin_cube_with_slr_ready_2_segments");
    CubeInstance cube2 = cubeManager.getCube("test_kylin_cube_without_slr_ready");
    CubeInstance cube2Clone = cloneCubeInstance(cubeManager, cube2, cube2.getName() + "_clone");

    //Related cubes:
    // - test_kylin_cube_with_slr_ready
    // - test_kylin_cube_with_slr_ready_2_segments
    // - test_kylin_cube_without_slr_ready
    String cubes = hybrid1.getCanonicalName() + "," + cube2Clone.getCanonicalName();

    SQLResponse sqlResponse = new SQLResponse();
    sqlResponse.setCube(cubes);

    String signature = SQLResponseSignatureUtil.createSignature(config, sqlResponse, projectName);
    sqlResponse.setSignature(signature);

    Assert.assertTrue(SQLResponseSignatureUtil.checkSignature(config, sqlResponse, projectName));

    { //Test the influence of related cubes status change
        cube1 = cubeManager.updateCubeStatus(cube1, RealizationStatusEnum.DISABLED);
        Assert.assertFalse(SQLResponseSignatureUtil.checkSignature(config, sqlResponse, projectName));

        cube1 = cubeManager.updateCubeStatus(cube1, RealizationStatusEnum.READY);
        Assert.assertTrue(SQLResponseSignatureUtil.checkSignature(config, sqlResponse, projectName));
    }

    { //Test the influence of segment changes
        cube2Clone = cubeManager.updateCubeDropSegments(cube2Clone, cube2Clone.getSegments().get(0));
        Assert.assertFalse(SQLResponseSignatureUtil.checkSignature(config, sqlResponse, projectName));
    }
}
Example 11
Source File: BulkLoadJob.java From Kylin with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    try {
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_HTABLE_NAME);
        options.addOption(OPTION_CUBE_NAME);
        parseOptions(options, args);

        String tableName = getOptionValue(OPTION_HTABLE_NAME).toUpperCase();
        // e.g
        // /tmp/kylin-3f150b00-3332-41ca-9d3d-652f67f044d7/test_kylin_cube_with_slr_ready_2_segments/hfile/
        // end with "/"
        String input = getOptionValue(OPTION_INPUT_PATH);

        Configuration conf = HBaseConfiguration.create(getConf());
        FileSystem fs = FileSystem.get(conf);

        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase();
        KylinConfig config = KylinConfig.getInstanceFromEnv();
        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeDesc cubeDesc = cube.getDescriptor();
        FsPermission permission = new FsPermission((short) 0777);
        for (HBaseColumnFamilyDesc cf : cubeDesc.getHBaseMapping().getColumnFamily()) {
            String cfName = cf.getName();
            fs.setPermission(new Path(input + cfName), permission);
        }

        String[] newArgs = new String[2];
        newArgs[0] = input;
        newArgs[1] = tableName;

        log.debug("Start to run LoadIncrementalHFiles");
        int ret = ToolRunner.run(new LoadIncrementalHFiles(conf), newArgs);
        log.debug("End to run LoadIncrementalHFiles");
        return ret;
    } catch (Exception e) {
        printUsage(options);
        throw e;
    }
}
Example 12
Source File: MergeStatisticsWithOldStep.java From kylin with Apache License 2.0 | 4 votes |
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    final CubeManager mgr = CubeManager.getInstance(context.getConfig());
    final CubeInstance cube = mgr.getCube(CubingExecutableUtil.getCubeName(this.getParams()));
    final CubeSegment optimizeSegment = cube.getSegmentById(CubingExecutableUtil.getSegmentId(this.getParams()));

    CubeSegment oldSegment = optimizeSegment.getCubeInstance().getOriginalSegmentToOptimize(optimizeSegment);
    Preconditions.checkNotNull(oldSegment, "cannot find the original segment to be optimized by " + optimizeSegment);

    KylinConfig kylinConf = cube.getConfig();
    Configuration conf = HadoopUtil.getCurrentConfiguration();
    ResourceStore rs = ResourceStore.getStore(kylinConf);
    int averageSamplingPercentage = 0;

    try {
        //1. Add statistics from optimized segment
        Path statisticsDirPath = new Path(CubingExecutableUtil.getStatisticsPath(this.getParams()));
        FileSystem hdfs = FileSystem.get(conf);
        if (!hdfs.exists(statisticsDirPath)) {
            throw new IOException("StatisticsFilePath " + statisticsDirPath + " does not exists");
        }

        if (!hdfs.isDirectory(statisticsDirPath)) {
            throw new IOException("StatisticsFilePath " + statisticsDirPath + " is not a directory");
        }

        Path[] statisticsFiles = HadoopUtil.getFilteredPath(hdfs, statisticsDirPath,
                BatchConstants.CFG_OUTPUT_STATISTICS);
        if (statisticsFiles == null) {
            throw new IOException("fail to find the statistics file in base dir: " + statisticsDirPath);
        }

        for (Path item : statisticsFiles) {
            CubeStatsReader optimizeSegmentStatsReader = new CubeStatsReader(optimizeSegment, null,
                    optimizeSegment.getConfig(), item);
            averageSamplingPercentage += optimizeSegmentStatsReader.getSamplingPercentage();
            addFromCubeStatsReader(optimizeSegmentStatsReader);
        }

        //2. Add statistics from old segment
        CubeStatsReader oldSegmentStatsReader = new CubeStatsReader(oldSegment, null, oldSegment.getConfig());
        averageSamplingPercentage += oldSegmentStatsReader.getSamplingPercentage();
        addFromCubeStatsReader(oldSegmentStatsReader);

        logger.info("Cuboid set with stats info: " + cuboidHLLMap.keySet().toString());

        //3. Store merged statistics for recommend cuboids
        averageSamplingPercentage = averageSamplingPercentage / 2;

        Set<Long> cuboidsRecommend = cube.getCuboidsRecommend();
        Map<Long, HLLCounter> resultCuboidHLLMap = Maps.newHashMapWithExpectedSize(cuboidsRecommend.size());
        for (Long cuboid : cuboidsRecommend) {
            HLLCounter hll = cuboidHLLMap.get(cuboid);
            if (hll == null) {
                logger.warn("Cannot get the row count stats for cuboid " + cuboid);
            } else {
                resultCuboidHLLMap.put(cuboid, hll);
            }
        }

        String resultDir = CubingExecutableUtil.getMergedStatisticsPath(this.getParams());
        CubeStatsWriter.writeCuboidStatistics(conf, new Path(resultDir), resultCuboidHLLMap,
                averageSamplingPercentage, oldSegmentStatsReader.getSourceRowCount());

        try (FSDataInputStream mergedStats = hdfs
                .open(new Path(resultDir, BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME))) {
            // put the statistics to metadata store
            String statisticsFileName = optimizeSegment.getStatisticsResourcePath();
            rs.putResource(statisticsFileName, mergedStats, System.currentTimeMillis());
        }

        //By default, the cube optimization will use in-memory cubing
        CubingJob cubingJob = (CubingJob) getManager()
                .getJob(CubingExecutableUtil.getCubingJobId(this.getParams()));
        StatisticsDecisionUtil.decideCubingAlgorithm(cubingJob, optimizeSegment);

        return new ExecuteResult();
    } catch (IOException e) {
        logger.error("fail to merge cuboid statistics", e);
        return ExecuteResult.createError(e);
    }
}
Example 13
Source File: FilterRecommendCuboidDataJob.java From kylin with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));

        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);
        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment optSegment = cube.getSegmentById(segmentID);
        CubeSegment originalSegment = cube.getOriginalSegmentToOptimize(optSegment);

        logger.info("Starting: " + job.getJobName());

        setJobClasspath(job, cube.getConfig());

        // Mapper
        job.setMapperClass(FilterRecommendCuboidDataMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Input
        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.setInputPaths(job, input);

        // Reducer
        ConvergeCuboidDataUtil.setupReducer(job, originalSegment, output);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
        // add metadata to distributed cache
        attachSegmentMetadata(originalSegment, job.getConfiguration(), false, false);

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        logger.error("error in CuboidJob", e);
        printUsage(options);
        throw e;
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 14
Source File: UpdateDictionaryStep.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    final CubeManager cubeMgr = CubeManager.getInstance(context.getConfig());
    final DictionaryManager dictMgrHdfs;
    final DictionaryManager dictMgrHbase;
    final CubeInstance cube = cubeMgr.getCube(CubingExecutableUtil.getCubeName(this.getParams()));
    final CubeSegment newSegment = cube.getSegmentById(CubingExecutableUtil.getSegmentId(this.getParams()));
    final List<CubeSegment> mergingSegments = getMergingSegments(cube);
    final String dictInfoPath = this.getParams().get(BatchConstants.ARG_DICT_PATH);
    final String metadataUrl = this.getParams().get(BatchConstants.ARG_META_URL);

    final KylinConfig kylinConfHbase = cube.getConfig();
    final KylinConfig kylinConfHdfs = AbstractHadoopJob.loadKylinConfigFromHdfs(metadataUrl);

    Collections.sort(mergingSegments);

    try {
        Configuration conf = HadoopUtil.getCurrentConfiguration();
        FileSystem fs = HadoopUtil.getWorkingFileSystem();
        ResourceStore hbaseRS = ResourceStore.getStore(kylinConfHbase);
        ResourceStore hdfsRS = ResourceStore.getStore(kylinConfHdfs);
        dictMgrHdfs = DictionaryManager.getInstance(kylinConfHdfs);
        dictMgrHbase = DictionaryManager.getInstance(kylinConfHbase);

        // work on copy instead of cached objects
        CubeInstance cubeCopy = cube.latestCopyForWrite();
        CubeSegment newSegCopy = cubeCopy.getSegmentById(newSegment.getUuid());

        // update cube segment dictionary
        FileStatus[] fileStatuss = fs.listStatus(new Path(dictInfoPath), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().startsWith("part") || path.getName().startsWith("tmp");
            }
        });

        for (FileStatus fileStatus : fileStatuss) {
            Path filePath = fileStatus.getPath();

            SequenceFile.Reader reader = new SequenceFile.Reader(fs, filePath, conf);
            Text key = (Text) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Text value = (Text) ReflectionUtils.newInstance(reader.getValueClass(), conf);

            while (reader.next(key, value)) {
                String tblCol = key.toString();
                String dictInfoResource = value.toString();

                if (StringUtils.isNotEmpty(dictInfoResource)) {
                    logger.info(dictInfoResource);
                    // put dictionary file to metadata store
                    DictionaryInfo dictInfoHdfs = dictMgrHdfs.getDictionaryInfo(dictInfoResource);
                    DictionaryInfo dicInfoHbase = dictMgrHbase.trySaveNewDict(dictInfoHdfs.getDictionaryObject(),
                            dictInfoHdfs);

                    if (dicInfoHbase != null) {
                        TblColRef tblColRef = cube.getDescriptor().findColumnRef(tblCol.split(":")[0],
                                tblCol.split(":")[1]);
                        newSegCopy.putDictResPath(tblColRef, dicInfoHbase.getResourcePath());
                    }
                }
            }

            IOUtils.closeStream(reader);
        }

        CubeSegment lastSeg = mergingSegments.get(mergingSegments.size() - 1);
        for (Map.Entry<String, String> entry : lastSeg.getSnapshots().entrySet()) {
            newSegCopy.putSnapshotResPath(entry.getKey(), entry.getValue());
        }

        // update statistics
        // put the statistics to metadata store
        String statisticsFileName = newSegment.getStatisticsResourcePath();
        hbaseRS.putResource(statisticsFileName, hdfsRS.getResource(newSegment.getStatisticsResourcePath()).content(),
                System.currentTimeMillis());

        CubeUpdate update = new CubeUpdate(cubeCopy);
        update.setToUpdateSegs(newSegCopy);
        cubeMgr.updateCube(update);

        return ExecuteResult.createSucceed();
    } catch (IOException e) {
        logger.error("fail to merge dictionary", e);
        return ExecuteResult.createError(e);
    }
}
Example 15
Source File: CubeHFileJob.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_PARTITION_FILE_PATH);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_HTABLE_NAME);
        parseOptions(options, args);

        Path partitionFilePath = new Path(getOptionValue(OPTION_PARTITION_FILE_PATH));
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        String cubeName = getOptionValue(OPTION_CUBE_NAME);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);

        // use current hbase configuration
        Configuration configuration = new Configuration(HBaseConnection.getCurrentHBaseConfiguration());
        String[] allServices = getAllServices(configuration);
        merge(configuration, getConf());
        configuration.setStrings(DFSConfigKeys.DFS_NAMESERVICES, allServices);

        job = Job.getInstance(configuration, getOptionValue(OPTION_JOB_NAME));

        setJobClasspath(job, cube.getConfig());

        addInputDirs(getOptionValue(OPTION_INPUT_PATH), job);
        FileOutputFormat.setOutputPath(job, output);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        // add metadata to distributed cache
        attachCubeMetadata(cube, job.getConfiguration());

        HTable htable = new HTable(configuration, getOptionValue(OPTION_HTABLE_NAME));

        // Automatic config !
        HFileOutputFormat3.configureIncrementalLoad(job, htable);
        reconfigurePartitions(configuration, partitionFilePath);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(CubeHFileMapper.class);
        job.setReducerClass(KeyValueReducer.class);
        job.setMapOutputKeyClass(RowKeyWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        job.setSortComparatorClass(RowKeyWritable.RowKeyComparator.class);

        // set block replication to 3 for hfiles
        configuration.set(DFSConfigKeys.DFS_REPLICATION_KEY, "3");

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 16
Source File: MergeDictJob.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_NAME);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        String jobName = getOptionValue(OPTION_JOB_NAME);
        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
        String segmentName = getOptionValue(OPTION_SEGMENT_NAME);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment segment = cube.getSegment(segmentName, SegmentStatusEnum.NEW);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        setJobClasspath(job, cube.getConfig());
        job.setJobName(jobName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_NAME, segmentName);

        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);
        logger.info("MergeDictReducer output path: {}", output);

        // Mapper
        job.setMapperClass(MergeDictMapper.class);
        job.setInputFormatClass(ColumnarSplitDictInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Reducer
        job.setReducerClass(MergeDictReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        attachCubeMetadata(cube, job.getConfiguration());

        deletePath(job.getConfiguration(), output);
        return waitForCompletion(job);
    } catch (Exception e) {
        printUsage(options);
        logger.error("job {} failed. ", job.getJobName(), e);
        throw e;
    }
}
Example 17
Source File: UHCDictionaryJob.java From kylin with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_CUBING_JOB_ID);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_INPUT_PATH);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));

        //add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        attachCubeMetadata(cube, job.getConfiguration());

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();

        //Note! handle uhc columns is null.
        boolean hasUHCValue = false;
        for (TblColRef tblColRef : uhcColumns) {
            Path path = new Path(input.toString() + "/" + tblColRef.getIdentity());
            if (HadoopUtil.getFileSystem(path).exists(path)) {
                FileInputFormat.addInputPath(job, path);
                FileInputFormat.setInputPathFilter(job, UHCDictPathFilter.class);
                hasUHCValue = true;
            }
        }

        if (!hasUHCValue) {
            isSkipped = true;
            return 0;
        }

        setJobClasspath(job, cube.getConfig());
        setupMapper();
        setupReducer(output, reducerCount);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);
        job.getConfiguration().set(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR,
                KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory());
        job.getConfiguration().set(BatchConstants.CFG_MAPRED_OUTPUT_COMPRESS, "false");

        //8G memory is enough for all global dict, because the input is sequential and we handle global dict slice by slice
        job.getConfiguration().set("mapreduce.reduce.memory.mb", "8500");
        job.getConfiguration().set("mapred.reduce.child.java.opts", "-Xmx8g");
        //Copying global dict to working dir in GlobalDictHDFSStore maybe elapsed a long time (Maybe we could improve it)
        //Waiting the global dict lock maybe also take a long time.
        //So we set 8 hours here
        job.getConfiguration().set("mapreduce.task.timeout", "28800000");

        //allow user specially set config for uhc step
        for (Map.Entry<String, String> entry : cube.getConfig().getUHCMRConfigOverride().entrySet()) {
            job.getConfiguration().set(entry.getKey(), entry.getValue());
        }

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 18
Source File: RangeKeyDistributionJob.java From Kylin with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        parseOptions(options, args);

        // start job
        String jobName = getOptionValue(OPTION_JOB_NAME);
        job = Job.getInstance(getConf(), jobName);

        setJobClasspath(job);

        addInputDirs(getOptionValue(OPTION_INPUT_PATH), job);

        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        FileOutputFormat.setOutputPath(job, output);
        // job.getConfiguration().set("dfs.block.size", "67108864");

        // Mapper
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(RangeKeyDistributionMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Reducer - only one
        job.setReducerClass(RangeKeyDistributionReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setNumReduceTasks(1);

        this.deletePath(job.getConfiguration(), output);

        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase();
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        RealizationCapacity realizationCapacity = cube.getDescriptor().getModel().getCapacity();
        job.getConfiguration().set(BatchConstants.CUBE_CAPACITY, realizationCapacity.toString());

        return waitForCompletion(job);
    } catch (Exception e) {
        printUsage(options);
        throw e;
    }
}
Example 19
Source File: AbstractExecutable.java From kylin with Apache License 2.0 | 4 votes |
public KylinConfig getCubeSpecificConfig() {
    String cubeName = getCubeName();
    CubeManager manager = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
    CubeInstance cube = manager.getCube(cubeName);
    return cube.getConfig();
}
Example 20
Source File: KafkaFlatTableJob.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_SEGMENT_ID);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        String segmentId = getOptionValue(OPTION_SEGMENT_ID);

        // ----------------------------------------------------------------------------
        // add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentId);
        logger.info("Starting: " + job.getJobName());

        setJobClasspath(job, cube.getConfig());

        KafkaConfigManager kafkaConfigManager = KafkaConfigManager.getInstance(KylinConfig.getInstanceFromEnv());
        KafkaConfig kafkaConfig = kafkaConfigManager.getKafkaConfig(cube.getRootFactTable());
        String brokers = KafkaClient.getKafkaBrokers(kafkaConfig);
        String topic = kafkaConfig.getTopic();

        if (brokers == null || brokers.length() == 0 || topic == null) {
            throw new IllegalArgumentException("Invalid Kafka information, brokers " + brokers + ", topic " + topic);
        }

        JobEngineConfig jobEngineConfig = new JobEngineConfig(KylinConfig.getInstanceFromEnv());
        job.getConfiguration().addResource(new Path(jobEngineConfig.getHadoopJobConfFilePath(null)));

        KafkaConsumerProperties kafkaConsumerProperties = KafkaConsumerProperties.getInstanceFromEnv();
        job.getConfiguration().addResource(new Path(kafkaConsumerProperties.getKafkaConsumerHadoopJobConf()));

        job.getConfiguration().set(CONFIG_KAFKA_BROKERS, brokers);
        job.getConfiguration().set(CONFIG_KAFKA_TOPIC, topic);
        job.getConfiguration().set(CONFIG_KAFKA_TIMEOUT, String.valueOf(kafkaConfig.getTimeout()));
        job.getConfiguration().set(CONFIG_KAFKA_INPUT_FORMAT, "json");
        job.getConfiguration().set(CONFIG_KAFKA_PARSER_NAME, kafkaConfig.getParserName());
        job.getConfiguration().set(CONFIG_KAFKA_SPLIT_ROWS, String.valueOf(kafkaConfig.getSplitRows()));
        job.getConfiguration().set(CONFIG_KAFKA_CONSUMER_GROUP, cubeName); // use cubeName as consumer group name

        appendKafkaOverrideProperties(cube.getConfig(), job.getConfiguration());
        setupMapper(cube.getSegmentById(segmentId));
        job.setNumReduceTasks(0);
        FileOutputFormat.setOutputPath(job, output);
        FileOutputFormat.setCompressOutput(job, true);
        org.apache.log4j.Logger.getRootLogger().info("Output hdfs location: " + output);
        org.apache.log4j.Logger.getRootLogger().info("Output hdfs compression: " + true);
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

        attachCubeMetadata(cube, job.getConfiguration());
        deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        logger.error("error in KafkaFlatTableJob", e);
        printUsage(options);
        throw e;
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}