org.apache.flink.api.java.operators.DataSource#map

Source File: PartitionITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Builds a delta iteration whose step function range-partitions the workset and
 * expects the program to be rejected with an {@code InvalidProgramException}.
 */
@Test(expected = InvalidProgramException.class)
public void testRangePartitionInIteration() throws Exception {

	if (super.mode == TestExecutionMode.COLLECTION) {
		// Not applicable to collection execution: raise the expected exception
		// directly so the test outcome is the same in this mode.
		throw new InvalidProgramException("Does not apply for collection execution");
	}

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSource<Long> numbers = env.generateSequence(0, 10000);

	// Pair every number with its decimal string representation.
	MapFunction<Long, Tuple2<Long, String>> toPair = new MapFunction<Long, Tuple2<Long, String>>() {
		@Override
		public Tuple2<Long, String> map(Long value) throws Exception {
			return new Tuple2<>(value, Long.toString(value));
		}
	};
	DataSet<Tuple2<Long, String>> pairs = numbers.map(toPair);

	DeltaIteration<Tuple2<Long, String>, Tuple2<Long, String>> iteration = pairs.iterateDelta(pairs, 10, 0);

	// Verify that range partition is not allowed in iteration
	DataSet<Tuple2<Long, String>> rangePartitionedWorkset = iteration.getWorkset().partitionByRange(1);
	DataSet<Tuple2<Long, String>> step = rangePartitionedWorkset
		.join(iteration.getSolutionSet())
		.where(0).equalTo(0).projectFirst(0).projectSecond(1);

	// The same data set serves as both solution-set delta and next workset.
	iteration.closeWith(step, step).collect(); // should fail
}

Source File: PartitionITCase.java From flink with Apache License 2.0

5 votes

/**
 * Builds a delta iteration whose step function range-partitions the workset and
 * expects the program to be rejected with an {@code InvalidProgramException}.
 */
@Test(expected = InvalidProgramException.class)
public void testRangePartitionInIteration() throws Exception {

	if (super.mode == TestExecutionMode.COLLECTION) {
		// Not applicable to collection execution: raise the expected exception
		// directly so the test outcome is the same in this mode.
		throw new InvalidProgramException("Does not apply for collection execution");
	}

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSource<Long> numbers = env.generateSequence(0, 10000);

	// Pair every number with its decimal string representation.
	MapFunction<Long, Tuple2<Long, String>> toPair = new MapFunction<Long, Tuple2<Long, String>>() {
		@Override
		public Tuple2<Long, String> map(Long value) throws Exception {
			return new Tuple2<>(value, Long.toString(value));
		}
	};
	DataSet<Tuple2<Long, String>> pairs = numbers.map(toPair);

	DeltaIteration<Tuple2<Long, String>, Tuple2<Long, String>> iteration = pairs.iterateDelta(pairs, 10, 0);

	// Verify that range partition is not allowed in iteration
	DataSet<Tuple2<Long, String>> rangePartitionedWorkset = iteration.getWorkset().partitionByRange(1);
	DataSet<Tuple2<Long, String>> step = rangePartitionedWorkset
		.join(iteration.getSolutionSet())
		.where(0).equalTo(0).projectFirst(0).projectSecond(1);

	// The same data set serves as both solution-set delta and next workset.
	iteration.closeWith(step, step).collect(); // should fail
}

Source File: PartitionITCase.java From flink with Apache License 2.0

5 votes

/**
 * Builds a delta iteration whose step function range-partitions the workset and
 * expects the program to be rejected with an {@code InvalidProgramException}.
 */
@Test(expected = InvalidProgramException.class)
public void testRangePartitionInIteration() throws Exception {

	if (super.mode == TestExecutionMode.COLLECTION) {
		// Not applicable to collection execution: raise the expected exception
		// directly so the test outcome is the same in this mode.
		throw new InvalidProgramException("Does not apply for collection execution");
	}

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSource<Long> numbers = env.generateSequence(0, 10000);

	// Pair every number with its decimal string representation.
	MapFunction<Long, Tuple2<Long, String>> toPair = new MapFunction<Long, Tuple2<Long, String>>() {
		@Override
		public Tuple2<Long, String> map(Long value) throws Exception {
			return new Tuple2<>(value, Long.toString(value));
		}
	};
	DataSet<Tuple2<Long, String>> pairs = numbers.map(toPair);

	DeltaIteration<Tuple2<Long, String>, Tuple2<Long, String>> iteration = pairs.iterateDelta(pairs, 10, 0);

	// Verify that range partition is not allowed in iteration
	DataSet<Tuple2<Long, String>> rangePartitionedWorkset = iteration.getWorkset().partitionByRange(1);
	DataSet<Tuple2<Long, String>> step = rangePartitionedWorkset
		.join(iteration.getSolutionSet())
		.where(0).equalTo(0).projectFirst(0).projectSecond(1);

	// The same data set serves as both solution-set delta and next workset.
	iteration.closeWith(step, step).collect(); // should fail
}

Source File: FlinkMergingDictionary.java From kylin-on-parquet-v2 with Apache License 2.0

4 votes

/**
 * Configures and runs the Flink job that maps a list of integer indexes through
 * {@code MergeDictAndStatsFunction} and writes the resulting {@code Text} pairs as a
 * sequence file under the dictionary output path.
 *
 * <p>Steps, in order: read the job options, optionally enable object reuse, delete any
 * existing dictionary output path, load the Kylin configuration and cube description,
 * build the index data set, attach the Hadoop sequence-file sink (parallelism 1) and
 * execute the job.
 *
 * @param optionsHelper provides the values of the job's command-line options
 * @throws Exception if setting up or executing the job fails
 */
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String segmentIds = optionsHelper.getOptionValue(OPTION_MERGE_SEGMENT_IDS);
    final String dictOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_DICT);
    final String statOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_STAT);
    final String objectReuseOpt = optionsHelper.getOptionValue(OPTION_ENABLE_OBJECT_REUSE);

    // Any non-empty option value switches object reuse on; the value itself is not parsed.
    final boolean enableObjectReuse = objectReuseOpt != null && !objectReuseOpt.isEmpty();

    final Job job = Job.getInstance();

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    if (enableObjectReuse) {
        env.getConfig().enableObjectReuse();
    }

    // Remove output left over from a previous run before writing new results.
    HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

    final SerializableConfiguration sConf = new SerializableConfiguration(job.getConfiguration());
    final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cube = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = CubeDescManager.getInstance(envConfig).getCubeDesc(cube.getDescName());

    logger.info("Dictionary output path: {}", dictOutputPath);
    logger.info("Statistics output path: {}", statOutputPath);

    final TblColRef[] tblColRefs = cubeDesc.getAllColumnsNeedDictionaryBuilt().toArray(new TblColRef[0]);
    final int columnLength = tblColRefs.length;

    // Indexes run from 0 to columnLength inclusive, i.e. one more entry than there are columns.
    // NOTE(review): presumably the extra index is handled by MergeDictAndStatsFunction as a
    // non-column task (e.g. statistics) - confirm before changing the bound.
    List<Integer> taskIndexes = Lists.newArrayListWithCapacity(columnLength);
    int nextIndex = 0;
    while (nextIndex <= columnLength) {
        taskIndexes.add(nextIndex);
        nextIndex++;
    }

    DataSource<Integer> indexSource = env.fromCollection(taskIndexes);

    MergeDictAndStatsFunction mergeFunction = new MergeDictAndStatsFunction(cubeName,
            metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf);
    DataSet<Tuple2<Text, Text>> mergedEntries = indexSource.map(mergeFunction);

    FlinkUtil.setHadoopConfForCuboid(job, null, null);
    HadoopOutputFormat<Text, Text> sequenceFileSink =
            new HadoopOutputFormat<>(new SequenceFileOutputFormat<>(), job);
    SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

    // The sink runs with a single parallel instance.
    mergedEntries.output(sequenceFileSink).setParallelism(1);

    env.execute("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
}

Source File: FlinkMergingDictionary.java From kylin with Apache License 2.0

4 votes

/**
 * Configures and runs the Flink job that maps a list of integer indexes through
 * {@code MergeDictAndStatsFunction} and writes the resulting {@code Text} pairs as a
 * sequence file under the dictionary output path.
 *
 * <p>Steps, in order: read the job options, optionally enable object reuse, delete any
 * existing dictionary output path, load the Kylin configuration and cube description,
 * build the index data set, attach the Hadoop sequence-file sink (parallelism 1) and
 * execute the job.
 *
 * @param optionsHelper provides the values of the job's command-line options
 * @throws Exception if setting up or executing the job fails
 */
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String segmentIds = optionsHelper.getOptionValue(OPTION_MERGE_SEGMENT_IDS);
    final String dictOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_DICT);
    final String statOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_STAT);
    final String objectReuseOpt = optionsHelper.getOptionValue(OPTION_ENABLE_OBJECT_REUSE);

    // Any non-empty option value switches object reuse on; the value itself is not parsed.
    final boolean enableObjectReuse = objectReuseOpt != null && !objectReuseOpt.isEmpty();

    final Job job = Job.getInstance();

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    if (enableObjectReuse) {
        env.getConfig().enableObjectReuse();
    }

    // Remove output left over from a previous run before writing new results.
    HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

    final SerializableConfiguration sConf = new SerializableConfiguration(job.getConfiguration());
    final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cube = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = CubeDescManager.getInstance(envConfig).getCubeDesc(cube.getDescName());

    logger.info("Dictionary output path: {}", dictOutputPath);
    logger.info("Statistics output path: {}", statOutputPath);

    final TblColRef[] tblColRefs = cubeDesc.getAllColumnsNeedDictionaryBuilt().toArray(new TblColRef[0]);
    final int columnLength = tblColRefs.length;

    // Indexes run from 0 to columnLength inclusive, i.e. one more entry than there are columns.
    // NOTE(review): presumably the extra index is handled by MergeDictAndStatsFunction as a
    // non-column task (e.g. statistics) - confirm before changing the bound.
    List<Integer> taskIndexes = Lists.newArrayListWithCapacity(columnLength);
    int nextIndex = 0;
    while (nextIndex <= columnLength) {
        taskIndexes.add(nextIndex);
        nextIndex++;
    }

    DataSource<Integer> indexSource = env.fromCollection(taskIndexes);

    MergeDictAndStatsFunction mergeFunction = new MergeDictAndStatsFunction(cubeName,
            metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf);
    DataSet<Tuple2<Text, Text>> mergedEntries = indexSource.map(mergeFunction);

    FlinkUtil.setHadoopConfForCuboid(job, null, null);
    HadoopOutputFormat<Text, Text> sequenceFileSink =
            new HadoopOutputFormat<>(new SequenceFileOutputFormat<>(), job);
    SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

    // The sink runs with a single parallel instance.
    mergedEntries.output(sequenceFileSink).setParallelism(1);

    env.execute("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
}

Java Code Examples for org.apache.flink.api.java.operators.DataSource#map()