org.apache.flink.client.program.ClusterClient#submitJob

Source File: AbstractOperatorRestoreTestBase.java From flink with Apache License 2.0

6 votes

private void restoreJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline, String savepointPath) throws Exception {
	JobGraph jobToRestore = createJobGraph(ExecutionMode.RESTORE);
	jobToRestore.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, allowNonRestoredState));

	assertNotNull("Job doesn't have a JobID.", jobToRestore.getJobID());

	clusterClient.submitJob(jobToRestore, classLoader);

	CompletableFuture<JobStatus> jobStatusFuture = FutureUtils.retrySuccessfulWithDelay(
		() -> clusterClient.getJobStatus(jobToRestore.getJobID()),
		Time.milliseconds(50),
		deadline,
		(jobStatus) -> jobStatus == JobStatus.FINISHED,
		TestingUtils.defaultScheduledExecutor());
	assertEquals(
		JobStatus.FINISHED,
		jobStatusFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
}

Source File: SavepointITCase.java From Flink-CEPplus with Apache License 2.0

6 votes

private String submitJobAndTakeSavepoint(MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
	final JobID jobId = jobGraph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	MiniClusterWithClientResource cluster = clusterFactory.get();
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {
		client.setDetached(true);
		client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

		StatefulCounter.getProgressLatch().await();

		return client.cancelWithSavepoint(jobId, null);
	} finally {
		cluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}

Source File: SavepointITCase.java From flink with Apache License 2.0

6 votes

private String submitJobAndTakeSavepoint(MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
	final JobID jobId = jobGraph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	MiniClusterWithClientResource cluster = clusterFactory.get();
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {
		client.setDetached(true);
		client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

		StatefulCounter.getProgressLatch().await();

		return client.cancelWithSavepoint(jobId, null);
	} finally {
		cluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}

Source File: SavepointITCase.java From flink with Apache License 2.0

5 votes

private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
	final JobID jobId = jobGraph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	MiniClusterWithClientResource cluster = clusterFactory.get();
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {
		client.setDetached(true);
		client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

		// Await state is restored
		StatefulCounter.getRestoreLatch().await();

		// Await some progress after restore
		StatefulCounter.getProgressLatch().await();

		client.cancel(jobId);

		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobId),
			Time.milliseconds(50),
			Deadline.now().plus(Duration.ofSeconds(30)),
			status -> status == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor()
		);

		client.disposeSavepoint(savepointPath)
			.get();

		assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
	} finally {
		cluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}

Source File: NetworkStackThroughputITCase.java From flink with Apache License 2.0

5 votes

private void testProgram(
		final MiniClusterWithClientResource cluster,
		final int dataVolumeGb,
		final boolean useForwarder,
		final boolean isSlowSender,
		final boolean isSlowReceiver,
		final int parallelism) throws Exception {
	ClusterClient<?> client = cluster.getClusterClient();
	client.setDetached(false);
	client.setPrintStatusDuringExecution(false);

	JobExecutionResult jer = (JobExecutionResult) client.submitJob(
		createJobGraph(
			dataVolumeGb,
			useForwarder,
			isSlowSender,
			isSlowReceiver,
			parallelism),
		getClass().getClassLoader());

	long dataVolumeMbit = dataVolumeGb * 8192;
	long runtimeSecs = jer.getNetRuntime(TimeUnit.SECONDS);

	int mbitPerSecond = (int) (((double) dataVolumeMbit) / runtimeSecs);

	LOG.info(String.format("Test finished with throughput of %d MBit/s (runtime [secs]: %d, " +
		"data volume [gb/mbits]: %d/%d)", mbitPerSecond, runtimeSecs, dataVolumeGb, dataVolumeMbit));
}

Source File: SavepointWriterITCase.java From flink with Apache License 2.0

5 votes

private void validateBootstrap(String savepointPath) throws ProgramInvocationException {
	StreamExecutionEnvironment sEnv = StreamExecutionEnvironment.getExecutionEnvironment();
	sEnv.setStateBackend(backend);

	CollectSink.accountList.clear();

	sEnv.fromCollection(accounts)
		.keyBy(acc -> acc.id)
		.flatMap(new UpdateAndGetAccount())
		.uid(ACCOUNT_UID)
		.addSink(new CollectSink());

	sEnv
		.fromCollection(currencyRates)
		.connect(sEnv.fromCollection(currencyRates).broadcast(descriptor))
		.process(new CurrencyValidationFunction())
		.uid(CURRENCY_UID)
		.addSink(new DiscardingSink<>());

	JobGraph jobGraph = sEnv.getStreamGraph().getJobGraph();
	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, false));

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.submitJob(jobGraph, SavepointWriterITCase.class.getClassLoader());

	Assert.assertEquals("Unexpected output", 3, CollectSink.accountList.size());
}

Source File: SavepointITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
	final JobID jobId = jobGraph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	MiniClusterWithClientResource cluster = clusterFactory.get();
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {
		client.setDetached(true);
		client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

		// Await state is restored
		StatefulCounter.getRestoreLatch().await();

		// Await some progress after restore
		StatefulCounter.getProgressLatch().await();

		client.cancel(jobId);

		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobId),
			Time.milliseconds(50),
			Deadline.now().plus(Duration.ofSeconds(30)),
			status -> status == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor()
		);

		client.disposeSavepoint(savepointPath)
			.get();

		assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
	} finally {
		cluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}

Source File: CancelingTestBase.java From flink with Apache License 2.0

5 votes

protected void runAndCancelJob(Plan plan, final int msecsTillCanceling, int maxTimeTillCanceled) throws Exception {
	// submit job
	final JobGraph jobGraph = getJobGraph(plan);

	ClusterClient<?> client = CLUSTER.getClusterClient();
	client.setDetached(true);

	JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, CancelingTestBase.class.getClassLoader());

	Deadline submissionDeadLine = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();

	JobStatus jobStatus = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
	while (jobStatus != JobStatus.RUNNING && submissionDeadLine.hasTimeLeft()) {
		Thread.sleep(50);
		jobStatus = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
	}
	if (jobStatus != JobStatus.RUNNING) {
		Assert.fail("Job not in state RUNNING.");
	}

	Thread.sleep(msecsTillCanceling);

	client.cancel(jobSubmissionResult.getJobID());

	Deadline cancelDeadline = new FiniteDuration(maxTimeTillCanceled, TimeUnit.MILLISECONDS).fromNow();

	JobStatus jobStatusAfterCancel = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
	while (jobStatusAfterCancel != JobStatus.CANCELED && cancelDeadline.hasTimeLeft()) {
		Thread.sleep(50);
		jobStatusAfterCancel = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
	}
	if (jobStatusAfterCancel != JobStatus.CANCELED) {
		Assert.fail("Failed to cancel job with ID " + jobSubmissionResult.getJobID() + '.');
	}
}

Source File: SavepointReaderKeyedStateITCase.java From flink with Apache License 2.0

5 votes

private String takeSavepoint(JobGraph jobGraph) throws Exception {
	SavepointSource.initializeForTest();

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.setDetached(true);

	JobID jobId = jobGraph.getJobID();

	Deadline deadline = Deadline.fromNow(Duration.ofMinutes(5));

	String dirPath = getTempDirPath(new AbstractID().toHexString());

	try {
		client.setDetached(true);
		JobSubmissionResult result = client.submitJob(jobGraph, getClass().getClassLoader());

		boolean finished = false;
		while (deadline.hasTimeLeft()) {
			if (SavepointSource.isFinished()) {
				finished = true;

				break;
			}
		}

		if (!finished) {
			Assert.fail("Failed to initialize state within deadline");
		}

		CompletableFuture<String> path = client.triggerSavepoint(result.getJobID(), dirPath);
		return path.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
	} finally {
		client.cancel(jobId);
	}
}

Source File: NetworkStackThroughputITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

private void testProgram(
		final MiniClusterWithClientResource cluster,
		final int dataVolumeGb,
		final boolean useForwarder,
		final boolean isSlowSender,
		final boolean isSlowReceiver,
		final int parallelism) throws Exception {
	ClusterClient<?> client = cluster.getClusterClient();
	client.setDetached(false);
	client.setPrintStatusDuringExecution(false);

	JobExecutionResult jer = (JobExecutionResult) client.submitJob(
		createJobGraph(
			dataVolumeGb,
			useForwarder,
			isSlowSender,
			isSlowReceiver,
			parallelism),
		getClass().getClassLoader());

	long dataVolumeMbit = dataVolumeGb * 8192;
	long runtimeSecs = jer.getNetRuntime(TimeUnit.SECONDS);

	int mbitPerSecond = (int) (((double) dataVolumeMbit) / runtimeSecs);

	LOG.info(String.format("Test finished with throughput of %d MBit/s (runtime [secs]: %d, " +
		"data volume [gb/mbits]: %d/%d)", mbitPerSecond, runtimeSecs, dataVolumeGb, dataVolumeMbit));
}

Source File: SavepointITCase.java From flink with Apache License 2.0

4 votes

@Test
public void testSubmitWithUnknownSavepointPath() throws Exception {
	// Config
	int numTaskManagers = 1;
	int numSlotsPerTaskManager = 1;
	int parallelism = numTaskManagers * numSlotsPerTaskManager;

	final Configuration config = new Configuration();
	config.setString(CheckpointingOptions.SAVEPOINT_DIRECTORY, savepointDir.toURI().toString());

	MiniClusterWithClientResource cluster = new MiniClusterWithClientResource(
		new MiniClusterResourceConfiguration.Builder()
			.setConfiguration(config)
			.setNumberTaskManagers(numTaskManagers)
			.setNumberSlotsPerTaskManager(numSlotsPerTaskManager)
			.build());
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {

		// High value to ensure timeouts if restarted.
		int numberOfRetries = 1000;
		// Submit the job
		// Long delay to ensure that the test times out if the job
		// manager tries to restart the job.
		final JobGraph jobGraph = createJobGraph(parallelism, numberOfRetries, 3600000);

		// Set non-existing savepoint path
		jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath("unknown path"));
		assertEquals("unknown path", jobGraph.getSavepointRestoreSettings().getRestorePath());

		LOG.info("Submitting job " + jobGraph.getJobID() + " in detached mode.");

		try {
			client.setDetached(false);
			client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());
		} catch (Exception e) {
			Optional<JobExecutionException> expectedJobExecutionException = ExceptionUtils.findThrowable(e, JobExecutionException.class);
			Optional<FileNotFoundException> expectedFileNotFoundException = ExceptionUtils.findThrowable(e, FileNotFoundException.class);
			if (!(expectedJobExecutionException.isPresent() && expectedFileNotFoundException.isPresent())) {
				throw e;
			}
		}
	} finally {
		cluster.after();
	}
}

Source File: AbstractOperatorRestoreTestBase.java From flink with Apache License 2.0

4 votes

private String migrateJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline) throws Throwable {

		URL savepointResource = AbstractOperatorRestoreTestBase.class.getClassLoader().getResource("operatorstate/" + getMigrationSavepointName());
		if (savepointResource == null) {
			throw new IllegalArgumentException("Savepoint file does not exist.");
		}
		JobGraph jobToMigrate = createJobGraph(ExecutionMode.MIGRATE);
		jobToMigrate.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointResource.getFile()));

		assertNotNull(jobToMigrate.getJobID());

		clusterClient.submitJob(jobToMigrate, classLoader);

		CompletableFuture<JobStatus> jobRunningFuture = FutureUtils.retrySuccessfulWithDelay(
			() -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
			Time.milliseconds(50),
			deadline,
			(jobStatus) -> jobStatus == JobStatus.RUNNING,
			TestingUtils.defaultScheduledExecutor());
		assertEquals(
			JobStatus.RUNNING,
			jobRunningFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

		// Trigger savepoint
		File targetDirectory = tmpFolder.newFolder();
		String savepointPath = null;

		// FLINK-6918: Retry cancel with savepoint message in case that StreamTasks were not running
		// TODO: The retry logic should be removed once the StreamTask lifecycle has been fixed (see FLINK-4714)
		while (deadline.hasTimeLeft() && savepointPath == null) {
			try {
				savepointPath = clusterClient.cancelWithSavepoint(
					jobToMigrate.getJobID(),
					targetDirectory.getAbsolutePath());
			} catch (Exception e) {
				String exceptionString = ExceptionUtils.stringifyException(e);
				if (!PATTERN_CANCEL_WITH_SAVEPOINT_TOLERATED_EXCEPTIONS.matcher(exceptionString).find()) {
					throw e;
				}
			}
		}

		assertNotNull("Could not take savepoint.", savepointPath);

		CompletableFuture<JobStatus> jobCanceledFuture = FutureUtils.retrySuccessfulWithDelay(
			() -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
			Time.milliseconds(50),
			deadline,
			(jobStatus) -> jobStatus == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor());
		assertEquals(
			JobStatus.CANCELED,
			jobCanceledFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

		return savepointPath;
	}

Source File: AccumulatorLiveITCase.java From flink with Apache License 2.0

4 votes

private static void submitJobAndVerifyResults(JobGraph jobGraph) throws Exception {
	Deadline deadline = Deadline.now().plus(Duration.ofSeconds(30));

	final ClusterClient<?> client = MINI_CLUSTER_RESOURCE.getClusterClient();

	final CheckedThread submissionThread = new CheckedThread() {
		@Override
		public void go() throws Exception {
			client.submitJob(jobGraph, AccumulatorLiveITCase.class.getClassLoader());
		}
	};

	submissionThread.start();

	try {
		NotifyingMapper.notifyLatch.await();

		FutureUtils.retrySuccessfulWithDelay(
			() -> {
				try {
					return CompletableFuture.completedFuture(client.getAccumulators(jobGraph.getJobID()));
				} catch (Exception e) {
					return FutureUtils.completedExceptionally(e);
				}
			},
			Time.milliseconds(20),
			deadline,
			accumulators -> accumulators.size() == 1
				&& accumulators.containsKey(ACCUMULATOR_NAME)
				&& (int) accumulators.get(ACCUMULATOR_NAME).getUnchecked() == NUM_ITERATIONS,
			TestingUtils.defaultScheduledExecutor()
		).get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		NotifyingMapper.shutdownLatch.trigger();
	} finally {
		NotifyingMapper.shutdownLatch.trigger();

		// wait for the job to have terminated
		submissionThread.sync();
	}
}

Source File: JMXJobManagerMetricTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Tests that metrics registered on the JobManager are actually accessible via JMX.
 */
@Test
public void testJobManagerJMXMetricAccess() throws Exception {
	Deadline deadline = Deadline.now().plus(Duration.ofMinutes(2));

	try {
		JobVertex sourceJobVertex = new JobVertex("Source");
		sourceJobVertex.setInvokableClass(BlockingInvokable.class);

		JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
		jobGraph.setSnapshotSettings(new JobCheckpointingSettings(
			Collections.<JobVertexID>emptyList(),
			Collections.<JobVertexID>emptyList(),
			Collections.<JobVertexID>emptyList(),
			new CheckpointCoordinatorConfiguration(
				500,
				500,
				50,
				5,
				CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
				true),
			null));

		ClusterClient<?> client = MINI_CLUSTER_RESOURCE.getClusterClient();
		client.setDetached(true);
		client.submitJob(jobGraph, JMXJobManagerMetricTest.class.getClassLoader());

		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobGraph.getJobID()),
			Time.milliseconds(10),
			deadline,
			status -> status == JobStatus.RUNNING,
			TestingUtils.defaultScheduledExecutor()
		).get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		MBeanServer mBeanServer = ManagementFactory.getPlatformMBeanServer();
		Set<ObjectName> nameSet = mBeanServer.queryNames(new ObjectName("org.apache.flink.jobmanager.job.lastCheckpointSize:job_name=TestingJob,*"), null);
		Assert.assertEquals(1, nameSet.size());
		assertEquals(-1L, mBeanServer.getAttribute(nameSet.iterator().next(), "Value"));

		BlockingInvokable.unblock();
	} finally {
		BlockingInvokable.unblock();
	}
}

Source File: JMXJobManagerMetricTest.java From flink with Apache License 2.0

4 votes

/**
 * Tests that metrics registered on the JobManager are actually accessible via JMX.
 */
@Test
public void testJobManagerJMXMetricAccess() throws Exception {
	Deadline deadline = Deadline.now().plus(Duration.ofMinutes(2));

	try {
		JobVertex sourceJobVertex = new JobVertex("Source");
		sourceJobVertex.setInvokableClass(BlockingInvokable.class);

		JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
		jobGraph.setSnapshotSettings(new JobCheckpointingSettings(
			Collections.<JobVertexID>emptyList(),
			Collections.<JobVertexID>emptyList(),
			Collections.<JobVertexID>emptyList(),
			new CheckpointCoordinatorConfiguration(
				500,
				500,
				50,
				5,
				CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
				true,
				false,
				0),
			null));

		ClusterClient<?> client = MINI_CLUSTER_RESOURCE.getClusterClient();
		client.setDetached(true);
		client.submitJob(jobGraph, JMXJobManagerMetricTest.class.getClassLoader());

		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobGraph.getJobID()),
			Time.milliseconds(10),
			deadline,
			status -> status == JobStatus.RUNNING,
			TestingUtils.defaultScheduledExecutor()
		).get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		MBeanServer mBeanServer = ManagementFactory.getPlatformMBeanServer();
		Set<ObjectName> nameSet = mBeanServer.queryNames(new ObjectName("org.apache.flink.jobmanager.job.lastCheckpointSize:job_name=TestingJob,*"), null);
		Assert.assertEquals(1, nameSet.size());
		assertEquals(-1L, mBeanServer.getAttribute(nameSet.iterator().next(), "Value"));

		BlockingInvokable.unblock();
	} finally {
		BlockingInvokable.unblock();
	}
}

Source File: SavepointMigrationTestBase.java From flink with Apache License 2.0

4 votes

@SafeVarargs
protected final void executeAndSavepoint(
		StreamExecutionEnvironment env,
		String savepointPath,
		Tuple2<String, Integer>... expectedAccumulators) throws Exception {

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.setDetached(true);

	// Submit the job
	JobGraph jobGraph = env.getStreamGraph().getJobGraph();

	JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, SavepointMigrationTestBase.class.getClassLoader());

	LOG.info("Submitted job {} and waiting...", jobSubmissionResult.getJobID());

	boolean done = false;
	while (DEADLINE.hasTimeLeft()) {
		Thread.sleep(100);
		Map<String, OptionalFailure<Object>> accumulators = client.getAccumulators(jobSubmissionResult.getJobID());

		boolean allDone = true;
		for (Tuple2<String, Integer> acc : expectedAccumulators) {
			OptionalFailure<Object> accumOpt = accumulators.get(acc.f0);
			if (accumOpt == null) {
				allDone = false;
				break;
			}

			Integer numFinished = (Integer) accumOpt.get();
			if (numFinished == null) {
				allDone = false;
				break;
			}
			if (!numFinished.equals(acc.f1)) {
				allDone = false;
				break;
			}
		}
		if (allDone) {
			done = true;
			break;
		}
	}

	if (!done) {
		fail("Did not see the expected accumulator results within time limit.");
	}

	LOG.info("Triggering savepoint.");

	CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobSubmissionResult.getJobID(), null);

	String jobmanagerSavepointPath = savepointPathFuture.get(DEADLINE.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

	File jobManagerSavepoint = new File(new URI(jobmanagerSavepointPath).getPath());
	// savepoints were changed to be directories in Flink 1.3
	if (jobManagerSavepoint.isDirectory()) {
		FileUtils.moveDirectory(jobManagerSavepoint, new File(savepointPath));
	} else {
		FileUtils.moveFile(jobManagerSavepoint, new File(savepointPath));
	}
}

Source File: SavepointMigrationTestBase.java From Flink-CEPplus with Apache License 2.0

4 votes

@SafeVarargs
protected final void executeAndSavepoint(
		StreamExecutionEnvironment env,
		String savepointPath,
		Tuple2<String, Integer>... expectedAccumulators) throws Exception {

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.setDetached(true);

	// Submit the job
	JobGraph jobGraph = env.getStreamGraph().getJobGraph();

	JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, SavepointMigrationTestBase.class.getClassLoader());

	LOG.info("Submitted job {} and waiting...", jobSubmissionResult.getJobID());

	boolean done = false;
	while (DEADLINE.hasTimeLeft()) {
		Thread.sleep(100);
		Map<String, OptionalFailure<Object>> accumulators = client.getAccumulators(jobSubmissionResult.getJobID());

		boolean allDone = true;
		for (Tuple2<String, Integer> acc : expectedAccumulators) {
			OptionalFailure<Object> accumOpt = accumulators.get(acc.f0);
			if (accumOpt == null) {
				allDone = false;
				break;
			}

			Integer numFinished = (Integer) accumOpt.get();
			if (numFinished == null) {
				allDone = false;
				break;
			}
			if (!numFinished.equals(acc.f1)) {
				allDone = false;
				break;
			}
		}
		if (allDone) {
			done = true;
			break;
		}
	}

	if (!done) {
		fail("Did not see the expected accumulator results within time limit.");
	}

	LOG.info("Triggering savepoint.");

	CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobSubmissionResult.getJobID(), null);

	String jobmanagerSavepointPath = savepointPathFuture.get(DEADLINE.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

	File jobManagerSavepoint = new File(new URI(jobmanagerSavepointPath).getPath());
	// savepoints were changed to be directories in Flink 1.3
	if (jobManagerSavepoint.isDirectory()) {
		FileUtils.moveDirectory(jobManagerSavepoint, new File(savepointPath));
	} else {
		FileUtils.moveFile(jobManagerSavepoint, new File(savepointPath));
	}
}

Source File: RescalingITCase.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 *
 * @throws Exception
 */
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
	final int parallelism = numSlots / 2;
	final int parallelism2 = numSlots;
	final int maxParallelism = 13;

	Duration timeout = Duration.ofMinutes(3);
	Deadline deadline = Deadline.now().plus(timeout);

	ClusterClient<?> client = cluster.getClusterClient();

	try {
		JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		final JobID jobID = jobGraph.getJobID();

		client.setDetached(true);
		client.submitJob(jobGraph, RescalingITCase.class.getClassLoader());

		// wait until the operator is started
		StateSourceBase.workStartedLatch.await();

		CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null);

		final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		client.cancel(jobID);

		while (!getRunningJobs(client).isEmpty()) {
			Thread.sleep(50);
		}

		// job successfully removed
		JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

		client.setDetached(false);
		client.submitJob(scaledJobGraph, RescalingITCase.class.getClassLoader());
	} catch (JobExecutionException exception) {
		if (exception.getCause() instanceof IllegalStateException) {
			// we expect a IllegalStateException wrapped
			// in a JobExecutionException, because the job containing non-partitioned state
			// is being rescaled
		} else {
			throw exception;
		}
	}
}

Source File: SavepointMigrationTestBase.java From flink with Apache License 2.0

4 votes

@SafeVarargs
protected final void restoreAndExecute(
		StreamExecutionEnvironment env,
		String savepointPath,
		Tuple2<String, Integer>... expectedAccumulators) throws Exception {

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.setDetached(true);

	// Submit the job
	JobGraph jobGraph = env.getStreamGraph().getJobGraph();

	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

	JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, SavepointMigrationTestBase.class.getClassLoader());

	boolean done = false;
	while (DEADLINE.hasTimeLeft()) {

		// try and get a job result, this will fail if the job already failed. Use this
		// to get out of this loop
		JobID jobId = jobSubmissionResult.getJobID();

		try {
			CompletableFuture<JobStatus> jobStatusFuture = client.getJobStatus(jobSubmissionResult.getJobID());

			JobStatus jobStatus = jobStatusFuture.get(5, TimeUnit.SECONDS);

			assertNotEquals(JobStatus.FAILED, jobStatus);
		} catch (Exception e) {
			fail("Could not connect to job: " + e);
		}

		Thread.sleep(100);
		Map<String, OptionalFailure<Object>> accumulators = client.getAccumulators(jobId);

		boolean allDone = true;
		for (Tuple2<String, Integer> acc : expectedAccumulators) {
			OptionalFailure<Object> numFinished = accumulators.get(acc.f0);
			if (numFinished == null) {
				allDone = false;
				break;
			}
			if (!numFinished.get().equals(acc.f1)) {
				allDone = false;
				break;
			}
		}

		if (allDone) {
			done = true;
			break;
		}
	}

	if (!done) {
		fail("Did not see the expected accumulator results within time limit.");
	}
}

Source File: AbstractOperatorRestoreTestBase.java From Flink-CEPplus with Apache License 2.0

4 votes

private String migrateJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline) throws Throwable {

		URL savepointResource = AbstractOperatorRestoreTestBase.class.getClassLoader().getResource("operatorstate/" + getMigrationSavepointName());
		if (savepointResource == null) {
			throw new IllegalArgumentException("Savepoint file does not exist.");
		}
		JobGraph jobToMigrate = createJobGraph(ExecutionMode.MIGRATE);
		jobToMigrate.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointResource.getFile()));

		assertNotNull(jobToMigrate.getJobID());

		clusterClient.submitJob(jobToMigrate, classLoader);

		CompletableFuture<JobStatus> jobRunningFuture = FutureUtils.retrySuccessfulWithDelay(
			() -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
			Time.milliseconds(50),
			deadline,
			(jobStatus) -> jobStatus == JobStatus.RUNNING,
			TestingUtils.defaultScheduledExecutor());
		assertEquals(
			JobStatus.RUNNING,
			jobRunningFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

		// Trigger savepoint
		File targetDirectory = tmpFolder.newFolder();
		String savepointPath = null;

		// FLINK-6918: Retry cancel with savepoint message in case that StreamTasks were not running
		// TODO: The retry logic should be removed once the StreamTask lifecycle has been fixed (see FLINK-4714)
		while (deadline.hasTimeLeft() && savepointPath == null) {
			try {
				savepointPath = clusterClient.cancelWithSavepoint(
					jobToMigrate.getJobID(),
					targetDirectory.getAbsolutePath());
			} catch (Exception e) {
				String exceptionString = ExceptionUtils.stringifyException(e);
				if (!(exceptionString.matches("(.*\n)*.*savepoint for the job .* failed(.*\n)*") // legacy
						|| exceptionString.matches("(.*\n)*.*was not running(.*\n)*")
						|| exceptionString.matches("(.*\n)*.*Not all required tasks are currently running(.*\n)*") // new
						|| exceptionString.matches("(.*\n)*.*Checkpoint was declined \\(tasks not ready\\)(.*\n)*"))) { // new
					throw e;
				}
			}
		}

		assertNotNull("Could not take savepoint.", savepointPath);

		CompletableFuture<JobStatus> jobCanceledFuture = FutureUtils.retrySuccessfulWithDelay(
			() -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
			Time.milliseconds(50),
			deadline,
			(jobStatus) -> jobStatus == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor());
		assertEquals(
			JobStatus.CANCELED,
			jobCanceledFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

		return savepointPath;
	}

Java Code Examples for org.apache.flink.client.program.ClusterClient#submitJob()