Java Code Examples for org.apache.flink.streaming.api.environment.StreamExecutionEnvironment#getParallelism()
The following examples show how to use org.apache.flink.streaming.api.environment.StreamExecutionEnvironment#getParallelism().
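Before the project examples, here is a minimal, self-contained sketch of what the call does: getParallelism() returns the environment's default parallelism, which transformations inherit unless they override it with setParallelism(...). The class name GetParallelismSketch and the toy pipeline are illustrative only and do not come from any of the projects below; the Flink DataStream API calls themselves are standard.

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

// Hypothetical example class; not taken from any of the quoted projects.
public class GetParallelismSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Pin the environment-wide default so the value read below is predictable.
        env.setParallelism(4);

        // getParallelism() reports the environment's default parallelism; operators
        // that do not call setParallelism(...) themselves inherit this value.
        int defaultParallelism = env.getParallelism();
        System.out.println("default parallelism: " + defaultParallelism);

        DataStream<Integer> doubled = env
                .fromElements(1, 2, 3)
                .map(new MapFunction<Integer, Integer>() {
                    @Override
                    public Integer map(Integer value) {
                        return 2 * value;
                    }
                });

        // The mapper runs with the default (4 here); the sink is pinned to 1, the same
        // pattern several of the examples below use when sizing per-subtask work.
        doubled.print().setParallelism(1);

        env.execute("getParallelism() sketch");
    }
}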
Example 1
Source File: StreamingJobGraphGeneratorTest.java From flink with Apache License 2.0
@Test
public void testOperatorCoordinatorAddedToJobVertex() {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    DataStream<Integer> stream = env.fromSource(
            new MockSource(Boundedness.BOUNDED, 1),
            WatermarkStrategy.noWatermarks(),
            "TestingSource");

    OneInputTransformation<Integer, Integer> resultTransform = new OneInputTransformation<Integer, Integer>(
            stream.getTransformation(),
            "AnyName",
            new CoordinatedTransformOperatorFactory(),
            BasicTypeInfo.INT_TYPE_INFO,
            env.getParallelism());

    new TestingSingleOutputStreamOperator<>(env, resultTransform).print();

    JobGraph jobGraph = StreamingJobGraphGenerator.createJobGraph(env.getStreamGraph());

    assertEquals(2, jobGraph.getVerticesAsArray()[0].getOperatorCoordinators().size());
}
Example 2
Source File: DataStreamSource.java From Flink-CEPplus with Apache License 2.0
public DataStreamSource(
        StreamExecutionEnvironment environment,
        TypeInformation<T> outTypeInfo,
        StreamSource<T, ?> operator,
        boolean isParallel,
        String sourceName) {
    super(environment, new SourceTransformation<>(sourceName, operator, outTypeInfo, environment.getParallelism()));

    this.isParallel = isParallel;

    if (!isParallel) {
        setParallelism(1);
    }
}
Example 3
Source File: DataStreamSource.java From flink with Apache License 2.0
public DataStreamSource(
        StreamExecutionEnvironment environment,
        TypeInformation<T> outTypeInfo,
        StreamSource<T, ?> operator,
        boolean isParallel,
        String sourceName) {
    super(environment, new SourceTransformation<>(sourceName, operator, outTypeInfo, environment.getParallelism()));

    this.isParallel = isParallel;

    if (!isParallel) {
        setParallelism(1);
    }
}
Example 4
Source File: BroadcastTriangleCount.java From gelly-streaming with Apache License 2.0
public static void main(String[] args) throws Exception {

    // Set up the environment
    if (!parseParameters(args)) {
        return;
    }
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    DataStream<Edge<Long, NullValue>> edges = getEdgesDataSet(env);
    int localSamples = samples / env.getParallelism();

    // Count triangles
    DataStream<Tuple2<Integer, Integer>> triangles = edges
            .broadcast()
            .flatMap(new TriangleSampler(localSamples, vertexCount))
            .flatMap(new TriangleSummer(samples, vertexCount))
            .setParallelism(1);

    // Emit the results
    if (fileOutput) {
        triangles.writeAsCsv(outputPath);
    } else {
        triangles.print();
    }

    env.execute("Broadcast Triangle Count");
}
Example 5
Source File: IncidenceSamplingTriangleCount.java From gelly-streaming with Apache License 2.0
public static void main(String[] args) throws Exception {

    // Set up the environment
    if (!parseParameters(args)) {
        return;
    }
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    DataStream<Edge<Long, NullValue>> edges = getEdgesDataSet(env);
    int localSamples = samples / env.getParallelism();

    // Count triangles
    DataStream<Tuple2<Integer, Integer>> triangles = edges
            .flatMap(new EdgeSampleMapper(localSamples, env.getParallelism()))
            .setParallelism(1)
            .keyBy(0)
            .flatMap(new TriangleSampleMapper(localSamples, vertexCount))
            .flatMap(new TriangleSummer(samples, vertexCount))
            .setParallelism(1);

    // Emit the results
    if (fileOutput) {
        triangles.writeAsCsv(outputPath);
    } else {
        triangles.print();
    }

    env.execute("Incidence Sampling Triangle Count");
}
Example 6
Source File: HiveTableSource.java From flink with Apache License 2.0
private DataStream<RowData> createStreamSourceForPartitionTable(
        StreamExecutionEnvironment execEnv,
        TypeInformation<RowData> typeInfo,
        HiveTableInputFormat inputFormat) {
    Configuration configuration = new Configuration();
    catalogTable.getOptions().forEach(configuration::setString);

    String consumeOrderStr = configuration.get(STREAMING_SOURCE_CONSUME_ORDER);
    ConsumeOrder consumeOrder = ConsumeOrder.getConsumeOrder(consumeOrderStr);
    String consumeOffset = configuration.get(STREAMING_SOURCE_CONSUME_START_OFFSET);
    String extractorKind = configuration.get(PARTITION_TIME_EXTRACTOR_KIND);
    String extractorClass = configuration.get(PARTITION_TIME_EXTRACTOR_CLASS);
    String extractorPattern = configuration.get(PARTITION_TIME_EXTRACTOR_TIMESTAMP_PATTERN);
    Duration monitorInterval = configuration.get(STREAMING_SOURCE_MONITOR_INTERVAL);

    HiveContinuousMonitoringFunction monitoringFunction = new HiveContinuousMonitoringFunction(
            hiveShim,
            jobConf,
            tablePath,
            catalogTable,
            execEnv.getParallelism(),
            consumeOrder,
            consumeOffset,
            extractorKind,
            extractorClass,
            extractorPattern,
            monitorInterval.toMillis());

    ContinuousFileReaderOperatorFactory<RowData, TimestampedHiveInputSplit> factory =
            new ContinuousFileReaderOperatorFactory<>(inputFormat);

    String sourceName = "HiveMonitoringFunction";
    SingleOutputStreamOperator<RowData> source = execEnv
            .addSource(monitoringFunction, sourceName)
            .transform("Split Reader: " + sourceName, typeInfo, factory);

    return new DataStreamSource<>(source);
}
Example 7
Source File: HiveTableSource.java From flink with Apache License 2.0
private DataStream<RowData> createStreamSourceForNonPartitionTable(
        StreamExecutionEnvironment execEnv,
        TypeInformation<RowData> typeInfo,
        HiveTableInputFormat inputFormat,
        HiveTablePartition hiveTable) {
    HiveTableFileInputFormat fileInputFormat = new HiveTableFileInputFormat(inputFormat, hiveTable);

    Configuration configuration = new Configuration();
    catalogTable.getOptions().forEach(configuration::setString);
    String consumeOrderStr = configuration.get(STREAMING_SOURCE_CONSUME_ORDER);
    ConsumeOrder consumeOrder = ConsumeOrder.getConsumeOrder(consumeOrderStr);
    if (consumeOrder != ConsumeOrder.CREATE_TIME_ORDER) {
        throw new UnsupportedOperationException(
                "Only " + ConsumeOrder.CREATE_TIME_ORDER + " is supported for non partition table.");
    }

    String consumeOffset = configuration.get(STREAMING_SOURCE_CONSUME_START_OFFSET);
    // to Local zone mills instead of UTC mills
    long currentReadTime = TimestampData.fromLocalDateTime(toLocalDateTime(consumeOffset))
            .toTimestamp().getTime();

    Duration monitorInterval = configuration.get(STREAMING_SOURCE_MONITOR_INTERVAL);

    ContinuousFileMonitoringFunction<RowData> monitoringFunction =
            new ContinuousFileMonitoringFunction<>(
                    fileInputFormat,
                    FileProcessingMode.PROCESS_CONTINUOUSLY,
                    execEnv.getParallelism(),
                    monitorInterval.toMillis(),
                    currentReadTime);

    ContinuousFileReaderOperatorFactory<RowData, TimestampedFileInputSplit> factory =
            new ContinuousFileReaderOperatorFactory<>(fileInputFormat);

    String sourceName = "HiveFileMonitoringFunction";
    SingleOutputStreamOperator<RowData> source = execEnv
            .addSource(monitoringFunction, sourceName)
            .transform("Split Reader: " + sourceName, typeInfo, factory);

    return new DataStreamSource<>(source);
}
Example 8
Source File: DataStreamSource.java From flink with Apache License 2.0
/**
 * The constructor used to create legacy sources.
 */
public DataStreamSource(
        StreamExecutionEnvironment environment,
        TypeInformation<T> outTypeInfo,
        StreamSource<T, ?> operator,
        boolean isParallel,
        String sourceName) {
    super(environment, new LegacySourceTransformation<>(sourceName, operator, outTypeInfo, environment.getParallelism()));

    this.isParallel = isParallel;
    if (!isParallel) {
        setParallelism(1);
    }
}
Example 9
Source File: DataStreamSource.java From flink with Apache License 2.0
/**
 * Constructor for new Sources (FLIP-27).
 */
public DataStreamSource(
        StreamExecutionEnvironment environment,
        Source<T, ?, ?> source,
        WatermarkStrategy<T> timestampsAndWatermarks,
        TypeInformation<T> outTypeInfo,
        String sourceName) {
    super(environment,
            new SourceTransformation<>(
                    sourceName,
                    new SourceOperatorFactory<>(source, timestampsAndWatermarks),
                    outTypeInfo,
                    environment.getParallelism()));
}
Example 10
Source File: ContinuousFileProcessingITCase.java From Flink-CEPplus with Apache License 2.0
@Test
public void testProgram() throws Exception {

    /*
     * This test checks the interplay between the monitor and the reader
     * and also the failExternally() functionality. To test the latter we
     * set the parallelism to 1 so that we have the chaining between the sink,
     * which throws the SuccessException to signal the end of the test, and the
     * reader.
     */

    TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
    format.setFilePath(hdfsURI);
    format.setFilesFilter(FilePathFilter.createDefaultFilter());

    // create the stream execution environment with a parallelism > 1 to test
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);

    ContinuousFileMonitoringFunction<String> monitoringFunction =
            new ContinuousFileMonitoringFunction<>(format,
                    FileProcessingMode.PROCESS_CONTINUOUSLY,
                    env.getParallelism(), INTERVAL);

    // the monitor has always DOP 1
    DataStream<TimestampedFileInputSplit> splits = env.addSource(monitoringFunction);
    Assert.assertEquals(1, splits.getParallelism());

    ContinuousFileReaderOperator<String> reader = new ContinuousFileReaderOperator<>(format);
    TypeInformation<String> typeInfo = TypeExtractor.getInputFormatTypes(format);

    // the readers can be multiple
    DataStream<String> content = splits.transform("FileSplitReader", typeInfo, reader);
    Assert.assertEquals(PARALLELISM, content.getParallelism());

    // finally for the sink we set the parallelism to 1 so that we can verify the output
    TestingSinkFunction sink = new TestingSinkFunction();
    content.addSink(sink).setParallelism(1);

    Thread job = new Thread() {
        @Override
        public void run() {
            try {
                env.execute("ContinuousFileProcessingITCase Job.");
            } catch (Exception e) {
                Throwable th = e;
                for (int depth = 0; depth < 20; depth++) {
                    if (th instanceof SuccessException) {
                        return;
                    } else if (th.getCause() != null) {
                        th = th.getCause();
                    } else {
                        break;
                    }
                }
                e.printStackTrace();
                Assert.fail(e.getMessage());
            }
        }
    };
    job.start();

    // The modification time of the last created file.
    long lastCreatedModTime = Long.MIN_VALUE;

    // create the files to be read
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> tmpFile;
        long modTime;
        do {
            // give it some time so that the files have
            // different modification timestamps.
            Thread.sleep(50);

            tmpFile = fillWithData(hdfsURI, "file", i, "This is test line.");

            modTime = hdfs.getFileStatus(tmpFile.f0).getModificationTime();
            if (modTime <= lastCreatedModTime) {
                // delete the last created file to recreate it with a different timestamp
                hdfs.delete(tmpFile.f0, false);
            }
        } while (modTime <= lastCreatedModTime);
        lastCreatedModTime = modTime;

        // put the contents in the expected results list before the reader picks them
        // this is to guarantee that they are in before the reader finishes (avoid race conditions)
        expectedContents.put(i, tmpFile.f1);

        org.apache.hadoop.fs.Path file = new org.apache.hadoop.fs.Path(hdfsURI + "/file" + i);
        hdfs.rename(tmpFile.f0, file);
        Assert.assertTrue(hdfs.exists(file));
    }

    // wait for the job to finish.
    job.join();
}
Example 11
Source File: ContinuousFileProcessingITCase.java From flink with Apache License 2.0
@Test
public void testProgram() throws Exception {

    /*
     * This test checks the interplay between the monitor and the reader
     * and also the failExternally() functionality. To test the latter we
     * set the parallelism to 1 so that we have the chaining between the sink,
     * which throws the SuccessException to signal the end of the test, and the
     * reader.
     */

    TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
    format.setFilePath(hdfsURI);
    format.setFilesFilter(FilePathFilter.createDefaultFilter());

    // create the stream execution environment with a parallelism > 1 to test
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);

    ContinuousFileMonitoringFunction<String> monitoringFunction =
            new ContinuousFileMonitoringFunction<>(format,
                    FileProcessingMode.PROCESS_CONTINUOUSLY,
                    env.getParallelism(), INTERVAL);

    // the monitor has always DOP 1
    DataStream<TimestampedFileInputSplit> splits = env.addSource(monitoringFunction);
    Assert.assertEquals(1, splits.getParallelism());

    ContinuousFileReaderOperator<String> reader = new ContinuousFileReaderOperator<>(format);
    TypeInformation<String> typeInfo = TypeExtractor.getInputFormatTypes(format);

    // the readers can be multiple
    DataStream<String> content = splits.transform("FileSplitReader", typeInfo, reader);
    Assert.assertEquals(PARALLELISM, content.getParallelism());

    // finally for the sink we set the parallelism to 1 so that we can verify the output
    TestingSinkFunction sink = new TestingSinkFunction();
    content.addSink(sink).setParallelism(1);

    Thread job = new Thread() {
        @Override
        public void run() {
            try {
                env.execute("ContinuousFileProcessingITCase Job.");
            } catch (Exception e) {
                Throwable th = e;
                for (int depth = 0; depth < 20; depth++) {
                    if (th instanceof SuccessException) {
                        return;
                    } else if (th.getCause() != null) {
                        th = th.getCause();
                    } else {
                        break;
                    }
                }
                e.printStackTrace();
                Assert.fail(e.getMessage());
            }
        }
    };
    job.start();

    // The modification time of the last created file.
    long lastCreatedModTime = Long.MIN_VALUE;

    // create the files to be read
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> tmpFile;
        long modTime;
        do {
            // give it some time so that the files have
            // different modification timestamps.
            Thread.sleep(50);

            tmpFile = fillWithData(hdfsURI, "file", i, "This is test line.");

            modTime = hdfs.getFileStatus(tmpFile.f0).getModificationTime();
            if (modTime <= lastCreatedModTime) {
                // delete the last created file to recreate it with a different timestamp
                hdfs.delete(tmpFile.f0, false);
            }
        } while (modTime <= lastCreatedModTime);
        lastCreatedModTime = modTime;

        // put the contents in the expected results list before the reader picks them
        // this is to guarantee that they are in before the reader finishes (avoid race conditions)
        expectedContents.put(i, tmpFile.f1);

        org.apache.hadoop.fs.Path file = new org.apache.hadoop.fs.Path(hdfsURI + "/file" + i);
        hdfs.rename(tmpFile.f0, file);
        Assert.assertTrue(hdfs.exists(file));
    }

    // wait for the job to finish.
    job.join();
}
Example 12
Source File: FlinkStreamingPortablePipelineTranslator.java From beam with Apache License 2.0
private static <T> DataStream<WindowedValue<T>> translateUnboundedSource(
        String transformName,
        String outputCollectionId,
        RunnerApi.ReadPayload payload,
        RunnerApi.Pipeline pipeline,
        PipelineOptions pipelineOptions,
        StreamExecutionEnvironment env) {

    final DataStream<WindowedValue<T>> source;
    final DataStream<WindowedValue<ValueWithRecordId<T>>> nonDedupSource;
    Coder<WindowedValue<T>> windowCoder =
            instantiateCoder(outputCollectionId, pipeline.getComponents());

    TypeInformation<WindowedValue<T>> outputTypeInfo = new CoderTypeInformation<>(windowCoder);

    WindowingStrategy windowStrategy =
            getWindowingStrategy(outputCollectionId, pipeline.getComponents());
    TypeInformation<WindowedValue<ValueWithRecordId<T>>> withIdTypeInfo =
            new CoderTypeInformation<>(
                    WindowedValue.getFullCoder(
                            ValueWithRecordId.ValueWithRecordIdCoder.of(
                                    ((WindowedValueCoder) windowCoder).getValueCoder()),
                            windowStrategy.getWindowFn().windowCoder()));

    UnboundedSource unboundedSource = ReadTranslation.unboundedSourceFromProto(payload);

    try {
        int parallelism =
                env.getMaxParallelism() > 0 ? env.getMaxParallelism() : env.getParallelism();
        UnboundedSourceWrapper sourceWrapper =
                new UnboundedSourceWrapper<>(transformName, pipelineOptions, unboundedSource, parallelism);
        nonDedupSource =
                env.addSource(sourceWrapper)
                        .name(transformName)
                        .uid(transformName)
                        .returns(withIdTypeInfo);

        if (unboundedSource.requiresDeduping()) {
            source =
                    nonDedupSource
                            .keyBy(new FlinkStreamingTransformTranslators.ValueWithRecordIdKeySelector<>())
                            .transform("deduping", outputTypeInfo, new DedupingOperator<>(pipelineOptions))
                            .uid(format("%s/__deduplicated__", transformName));
        } else {
            source =
                    nonDedupSource
                            .flatMap(new FlinkStreamingTransformTranslators.StripIdsMap<>(pipelineOptions))
                            .returns(outputTypeInfo);
        }
    } catch (Exception e) {
        throw new RuntimeException("Error while translating UnboundedSource: " + unboundedSource, e);
    }

    return source;
}
Example 13
Source File: ContinuousFileProcessingITCase.java From flink with Apache License 2.0
@Test
public void testProgram() throws Exception {

    /*
     * This test checks the interplay between the monitor and the reader
     * and also the failExternally() functionality. To test the latter we
     * set the parallelism to 1 so that we have the chaining between the sink,
     * which throws the SuccessException to signal the end of the test, and the
     * reader.
     */

    TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
    format.setFilePath(hdfsURI);
    format.setFilesFilter(FilePathFilter.createDefaultFilter());

    // create the stream execution environment with a parallelism > 1 to test
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);

    ContinuousFileMonitoringFunction<String> monitoringFunction =
            new ContinuousFileMonitoringFunction<>(format,
                    FileProcessingMode.PROCESS_CONTINUOUSLY,
                    env.getParallelism(), INTERVAL);

    // the monitor has always DOP 1
    DataStream<TimestampedFileInputSplit> splits = env.addSource(monitoringFunction);
    Assert.assertEquals(1, splits.getParallelism());

    TypeInformation<String> typeInfo = TypeExtractor.getInputFormatTypes(format);

    // the readers can be multiple
    DataStream<String> content = splits.transform("FileSplitReader", typeInfo,
            new ContinuousFileReaderOperatorFactory<>(format));
    Assert.assertEquals(PARALLELISM, content.getParallelism());

    // finally for the sink we set the parallelism to 1 so that we can verify the output
    TestingSinkFunction sink = new TestingSinkFunction();
    content.addSink(sink).setParallelism(1);

    CompletableFuture<Void> jobFuture = new CompletableFuture<>();
    new Thread(() -> {
        try {
            env.execute("ContinuousFileProcessingITCase Job.");
            jobFuture.complete(null);
        } catch (Exception e) {
            if (ExceptionUtils.findThrowable(e, SuccessException.class).isPresent()) {
                jobFuture.complete(null);
            } else {
                jobFuture.completeExceptionally(e);
            }
        }
    }).start();

    // The modification time of the last created file.
    long lastCreatedModTime = Long.MIN_VALUE;

    // create the files to be read
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> tmpFile;
        long modTime;
        do {
            // give it some time so that the files have
            // different modification timestamps.
            Thread.sleep(50);

            tmpFile = fillWithData(hdfsURI, "file", i, "This is test line.");

            modTime = hdfs.getFileStatus(tmpFile.f0).getModificationTime();
            if (modTime <= lastCreatedModTime) {
                // delete the last created file to recreate it with a different timestamp
                hdfs.delete(tmpFile.f0, false);
            }
        } while (modTime <= lastCreatedModTime);
        lastCreatedModTime = modTime;

        // put the contents in the expected results list before the reader picks them
        // this is to guarantee that they are in before the reader finishes (avoid race conditions)
        expectedContents.put(i, tmpFile.f1);

        org.apache.hadoop.fs.Path file = new org.apache.hadoop.fs.Path(hdfsURI + "/file" + i);
        hdfs.rename(tmpFile.f0, file);
        Assert.assertTrue(hdfs.exists(file));
    }

    jobFuture.get();
}