org.apache.flink.client.program.ClusterClient#cancel

Source File: SavepointTestBase.java From flink with Apache License 2.0

6 votes

/**
 * Runs the job produced by {@code jobGraphFactory} on the mini cluster, waits for the source
 * created from {@code data} to signal via {@code awaitSource()}, and then triggers a savepoint.
 * The job is cancelled in all cases before this method returns.
 *
 * @param data elements handed to {@code createSource}
 * @param jobGraphFactory builds the environment (and thereby the job) around the waiting source
 * @param <T> element type of the source
 * @return the value produced by {@code triggerSavepoint}, waited on for at most five minutes
 * @throws RuntimeException wrapping any failure during submission or while taking the savepoint
 */
public <T> String takeSavepoint(Collection<T> data, Function<SourceFunction<T>, StreamExecutionEnvironment> jobGraphFactory) throws Exception {
	StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
	executionEnvironment.getConfig().disableClosureCleaner();

	WaitingSource<T> source = createSource(data);

	JobGraph graph = jobGraphFactory.apply(source).getStreamGraph().getJobGraph();
	JobID jobId = graph.getJobID();

	ClusterClient<?> clusterClient = miniClusterResource.getClusterClient();

	try {
		JobSubmissionResult submission = ClientUtils.submitJob(clusterClient, graph);

		// First wait (asynchronously) for the source, only then ask for the savepoint.
		CompletableFuture<Void> sourceReady = CompletableFuture.runAsync(source::awaitSource);
		return sourceReady
			.thenCompose(ignore -> triggerSavepoint(clusterClient, submission.getJobID()))
			.get(5, TimeUnit.MINUTES);
	} catch (Exception failure) {
		throw new RuntimeException("Failed to take savepoint", failure);
	} finally {
		// Always stop the job, whether or not the savepoint succeeded.
		clusterClient.cancel(jobId);
	}
}

Source File: CancelingTestBase.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Submits the job built from {@code plan} in detached mode, waits until it is reported as
 * {@code RUNNING}, cancels it after a delay and checks that it reaches {@code CANCELED}.
 *
 * @param plan plan converted into a job graph via {@code getJobGraph}
 * @param msecsTillCanceling milliseconds to sleep between observing RUNNING and cancelling
 * @param maxTimeTillCanceled milliseconds allowed for the job to reach CANCELED
 * @throws Exception if submission, status polling or cancellation fails
 */
protected void runAndCancelJob(Plan plan, final int msecsTillCanceling, int maxTimeTillCanceled) throws Exception {
	final JobGraph graph = getJobGraph(plan);

	final ClusterClient<?> clusterClient = CLUSTER.getClusterClient();
	clusterClient.setDetached(true);

	final JobSubmissionResult submission = clusterClient.submitJob(graph, CancelingTestBase.class.getClassLoader());

	// Poll every 50 ms, for at most two minutes, until the job is RUNNING.
	final Deadline runningDeadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
	JobStatus currentStatus;
	while (true) {
		currentStatus = clusterClient.getJobStatus(submission.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
		if (currentStatus == JobStatus.RUNNING || !runningDeadline.hasTimeLeft()) {
			break;
		}
		Thread.sleep(50);
	}
	if (currentStatus != JobStatus.RUNNING) {
		Assert.fail("Job not in state RUNNING.");
	}

	Thread.sleep(msecsTillCanceling);

	clusterClient.cancel(submission.getJobID());

	// Same polling scheme, now waiting for CANCELED within the caller-supplied budget.
	final Deadline canceledDeadline = new FiniteDuration(maxTimeTillCanceled, TimeUnit.MILLISECONDS).fromNow();
	JobStatus statusAfterCancel;
	while (true) {
		statusAfterCancel = clusterClient.getJobStatus(submission.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
		if (statusAfterCancel == JobStatus.CANCELED || !canceledDeadline.hasTimeLeft()) {
			break;
		}
		Thread.sleep(50);
	}
	if (statusAfterCancel != JobStatus.CANCELED) {
		Assert.fail("Failed to cancel job with ID " + submission.getJobID() + '.');
	}
}

Source File: SavepointITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Restores a job from the given savepoint on a cluster obtained from {@code clusterFactory},
 * waits for the restore and for some progress, cancels the job, and finally disposes the
 * savepoint and asserts that its path no longer exists.
 *
 * @param savepointPath path of the savepoint to restore from and to dispose afterwards
 * @param clusterFactory supplies the cluster resource the job runs on
 * @param parallelism parallelism used for the job graph and for resetting {@code StatefulCounter}
 * @throws Exception if submission, cancellation, the wait for CANCELED or disposal fails
 */
private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
	final JobID jobId = jobGraph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	MiniClusterWithClientResource cluster = clusterFactory.get();
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {
		client.setDetached(true);
		client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

		// Await state is restored
		StatefulCounter.getRestoreLatch().await();

		// Await some progress after restore
		StatefulCounter.getProgressLatch().await();

		client.cancel(jobId);

		// The returned future must be awaited: without get() the status polling only runs in
		// the background and the savepoint below would be disposed while the job may still be
		// shutting down. The 30 second deadline bounds how long this can block.
		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobId),
			Time.milliseconds(50),
			Deadline.now().plus(Duration.ofSeconds(30)),
			status -> status == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor()
		).get();

		client.disposeSavepoint(savepointPath)
			.get();

		assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
	} finally {
		// Tear the cluster down and reset the shared counters even when the test body fails.
		cluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}

Source File: ResumeCheckpointManuallyITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Submits a job, waits until its sources have started and an externalized checkpoint exists,
 * cancels the job and returns the path of that checkpoint.
 *
 * @param backend state backend passed to {@code getJobGraph}
 * @param checkpointDir directory that is watched for the externalized checkpoint
 * @param externalCheckpoint checkpoint passed on to {@code getJobGraph}; may be {@code null}
 * @param client client used to submit and cancel the job
 * @return the path reported by {@code getExternalizedCheckpointCheckpointPath}
 * @throws Exception if submission, waiting or cancellation fails
 */
private static String runJobAndGetExternalizedCheckpoint(StateBackend backend, File checkpointDir, @Nullable String externalCheckpoint, ClusterClient<?> client) throws Exception {
	final JobGraph job = getJobGraph(backend, externalCheckpoint);

	// Fresh latch for this run, sized by PARALLELISM.
	NotifyingInfiniteTupleSource.countDownLatch = new CountDownLatch(PARALLELISM);

	final ClassLoader userCodeLoader = ResumeCheckpointManuallyITCase.class.getClassLoader();
	client.submitJob(job, userCodeLoader);

	// Block until the latch has been counted down completely (all sources started).
	NotifyingInfiniteTupleSource.countDownLatch.await();

	waitUntilExternalizedCheckpointCreated(checkpointDir, job.getJobID());

	client.cancel(job.getJobID());
	waitUntilCanceled(job.getJobID(), client);

	final String checkpointPath = getExternalizedCheckpointCheckpointPath(checkpointDir, job.getJobID());
	return checkpointPath;
}

Source File: CancelingTestBase.java From flink with Apache License 2.0

5 votes

/**
 * Submits the job built from {@code plan} in detached mode, waits until it is reported as
 * {@code RUNNING}, cancels it after a delay and checks that it reaches {@code CANCELED}.
 *
 * @param plan plan converted into a job graph via {@code getJobGraph}
 * @param msecsTillCanceling milliseconds to sleep between observing RUNNING and cancelling
 * @param maxTimeTillCanceled milliseconds allowed for the job to reach CANCELED
 * @throws Exception if submission, status polling or cancellation fails
 */
protected void runAndCancelJob(Plan plan, final int msecsTillCanceling, int maxTimeTillCanceled) throws Exception {
	final JobGraph graph = getJobGraph(plan);

	final ClusterClient<?> clusterClient = CLUSTER.getClusterClient();
	clusterClient.setDetached(true);

	final JobSubmissionResult submission = clusterClient.submitJob(graph, CancelingTestBase.class.getClassLoader());

	// Poll every 50 ms, for at most two minutes, until the job is RUNNING.
	final Deadline runningDeadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
	JobStatus currentStatus;
	while (true) {
		currentStatus = clusterClient.getJobStatus(submission.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
		if (currentStatus == JobStatus.RUNNING || !runningDeadline.hasTimeLeft()) {
			break;
		}
		Thread.sleep(50);
	}
	if (currentStatus != JobStatus.RUNNING) {
		Assert.fail("Job not in state RUNNING.");
	}

	Thread.sleep(msecsTillCanceling);

	clusterClient.cancel(submission.getJobID());

	// Same polling scheme, now waiting for CANCELED within the caller-supplied budget.
	final Deadline canceledDeadline = new FiniteDuration(maxTimeTillCanceled, TimeUnit.MILLISECONDS).fromNow();
	JobStatus statusAfterCancel;
	while (true) {
		statusAfterCancel = clusterClient.getJobStatus(submission.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
		if (statusAfterCancel == JobStatus.CANCELED || !canceledDeadline.hasTimeLeft()) {
			break;
		}
		Thread.sleep(50);
	}
	if (statusAfterCancel != JobStatus.CANCELED) {
		Assert.fail("Failed to cancel job with ID " + submission.getJobID() + '.');
	}
}

Source File: SavepointITCase.java From flink with Apache License 2.0

5 votes

/**
 * Restores a job from the given savepoint on a cluster obtained from {@code clusterFactory},
 * waits for the restore and for some progress, cancels the job, and finally disposes the
 * savepoint and asserts that its path no longer exists.
 *
 * @param savepointPath path of the savepoint to restore from and to dispose afterwards
 * @param clusterFactory supplies the cluster resource the job runs on
 * @param parallelism parallelism used for the job graph and for resetting {@code StatefulCounter}
 * @throws Exception if submission, cancellation, the wait for CANCELED or disposal fails
 */
private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
	final JobID jobId = jobGraph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	MiniClusterWithClientResource cluster = clusterFactory.get();
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {
		client.setDetached(true);
		client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

		// Await state is restored
		StatefulCounter.getRestoreLatch().await();

		// Await some progress after restore
		StatefulCounter.getProgressLatch().await();

		client.cancel(jobId);

		// The returned future must be awaited: without get() the status polling only runs in
		// the background and the savepoint below would be disposed while the job may still be
		// shutting down. The 30 second deadline bounds how long this can block.
		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobId),
			Time.milliseconds(50),
			Deadline.now().plus(Duration.ofSeconds(30)),
			status -> status == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor()
		).get();

		client.disposeSavepoint(savepointPath)
			.get();

		assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
	} finally {
		// Tear the cluster down and reset the shared counters even when the test body fails.
		cluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}

Source File: ResumeCheckpointManuallyITCase.java From flink with Apache License 2.0

5 votes

/**
 * Submits a job, waits until its sources have started and an externalized checkpoint exists,
 * cancels the job and returns the path of that checkpoint.
 *
 * @param backend state backend passed to {@code getJobGraph}
 * @param checkpointDir directory that is watched for the externalized checkpoint
 * @param externalCheckpoint checkpoint passed on to {@code getJobGraph}; may be {@code null}
 * @param client client used to submit and cancel the job
 * @return the path reported by {@code getExternalizedCheckpointCheckpointPath}
 * @throws Exception if submission, waiting or cancellation fails
 */
private static String runJobAndGetExternalizedCheckpoint(StateBackend backend, File checkpointDir, @Nullable String externalCheckpoint, ClusterClient<?> client) throws Exception {
	final JobGraph job = getJobGraph(backend, externalCheckpoint);

	// Fresh latch for this run, sized by PARALLELISM.
	NotifyingInfiniteTupleSource.countDownLatch = new CountDownLatch(PARALLELISM);

	final ClassLoader userCodeLoader = ResumeCheckpointManuallyITCase.class.getClassLoader();
	client.submitJob(job, userCodeLoader);

	// Block until the latch has been counted down completely (all sources started).
	NotifyingInfiniteTupleSource.countDownLatch.await();

	waitUntilExternalizedCheckpointCreated(checkpointDir, job.getJobID());

	client.cancel(job.getJobID());
	waitUntilCanceled(job.getJobID(), client);

	final String checkpointPath = getExternalizedCheckpointCheckpointPath(checkpointDir, job.getJobID());
	return checkpointPath;
}

Source File: SavepointReaderKeyedStateITCase.java From flink with Apache License 2.0

5 votes

/**
 * Submits {@code jobGraph} in detached mode, waits until {@code SavepointSource} reports that
 * it is finished, triggers a savepoint into a fresh temporary directory and returns its path.
 * The job is cancelled in all cases before this method returns.
 *
 * @param jobGraph job to run and take the savepoint of
 * @return the path returned by the savepoint future
 * @throws Exception if submission or the savepoint fails, the five minute deadline expires, or
 *     the waiting thread is interrupted
 */
private String takeSavepoint(JobGraph jobGraph) throws Exception {
	SavepointSource.initializeForTest();

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.setDetached(true);

	JobID jobId = jobGraph.getJobID();

	Deadline deadline = Deadline.fromNow(Duration.ofMinutes(5));

	String dirPath = getTempDirPath(new AbstractID().toHexString());

	try {
		JobSubmissionResult result = client.submitJob(jobGraph, getClass().getClassLoader());

		boolean finished = false;
		while (deadline.hasTimeLeft()) {
			if (SavepointSource.isFinished()) {
				finished = true;

				break;
			}

			// Yield briefly between polls; without this the loop spins a CPU core for up to
			// five minutes while competing with the job it is waiting for.
			Thread.sleep(2L);
		}

		if (!finished) {
			Assert.fail("Failed to initialize state within deadline");
		}

		CompletableFuture<String> path = client.triggerSavepoint(result.getJobID(), dirPath);
		return path.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
	} finally {
		// Always stop the job, whether or not the savepoint succeeded.
		client.cancel(jobId);
	}
}

Source File: AbstractFlinkClient.java From alchemy with Apache License 2.0

5 votes

/**
 * Cancels the job named in {@code request}, with a savepoint when the request asks for one.
 *
 * @param clusterClient client used to issue the cancellation
 * @param request carries the job id (hex string), the savepoint flag and the savepoint directory
 * @return a message-only response when the request has no job id; otherwise a response built
 *     with {@code true}, plus the savepoint path when a savepoint was taken
 * @throws Exception if the cluster client call fails
 */
public SavepointResponse cancel(ClusterClient clusterClient, CancelFlinkRequest request) throws Exception {
    // Nothing to cancel when the request carries no job id.
    if (StringUtils.isEmpty(request.getJobID())) {
        return new SavepointResponse("the job is not submit yet");
    }

    // A missing (null) flag is treated the same as false: plain cancellation.
    if (!Boolean.TRUE.equals(request.getSavePoint())) {
        clusterClient.cancel(JobID.fromHexString(request.getJobID()));
        return new SavepointResponse(true);
    }

    String savepointPath = clusterClient.cancelWithSavepoint(
        JobID.fromHexString(request.getJobID()), request.getSavepointDirectory());
    return new SavepointResponse(true, savepointPath);
}

Source File: ClassLoaderITCase.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Tests disposal of a savepoint, which contains custom user code KvState.
 *
 * <p>The program is started on a separate thread, the test waits for a job in state RUNNING,
 * triggers a savepoint (retrying up to 20 times), disposes that savepoint and cancels the job.
 *
 * @throws Exception if any cluster interaction fails or the 100 second deadline is exceeded
 */
@Test
public void testDisposeSavepointWithCustomKvState() throws Exception {
	ClusterClient<?> clusterClient = new MiniClusterClient(new Configuration(), miniClusterResource.getMiniCluster());

	Deadline deadline = new FiniteDuration(100, TimeUnit.SECONDS).fromNow();

	File checkpointDir = FOLDER.newFolder();
	File outputDir = FOLDER.newFolder();

	final PackagedProgram program = new PackagedProgram(
			new File(CUSTOM_KV_STATE_JAR_PATH),
			new String[] {
					String.valueOf(parallelism),
					checkpointDir.toURI().toString(),
					"5000",
					outputDir.toURI().toString()
			});

	TestStreamEnvironment.setAsContext(
		miniClusterResource.getMiniCluster(),
		parallelism,
		Collections.singleton(new Path(CUSTOM_KV_STATE_JAR_PATH)),
		Collections.<URL>emptyList()
	);

	// Execute detached
	Thread invokeThread = new Thread(new Runnable() {
		@Override
		public void run() {
			try {
				program.invokeInteractiveModeForExecution();
			} catch (ProgramInvocationException e) {
				// A cancellation is the expected way for the program to end; report anything else.
				if (e.getCause() == null ||
					!(e.getCause() instanceof JobCancellationException)) {
					e.printStackTrace();
				}
			}
		}
	});

	LOG.info("Starting program invoke thread");
	invokeThread.start();

	// The job ID
	JobID jobId = null;

	LOG.info("Waiting for job status running.");

	// Wait for running job
	while (jobId == null && deadline.hasTimeLeft()) {

		Collection<JobStatusMessage> jobs = clusterClient.listJobs().get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
		for (JobStatusMessage job : jobs) {
			if (job.getJobState() == JobStatus.RUNNING) {
				jobId = job.getJobId();
				LOG.info("Job running. ID: " + jobId);
				break;
			}
		}

		// Retry if job is not available yet
		if (jobId == null) {
			Thread.sleep(100L);
		}
	}

	// Fail right away with a precise message instead of passing a null job ID to the client.
	assertNotNull("No running job found within the deadline", jobId);

	// Trigger savepoint. Stop at the first success: continuing would create further savepoints
	// of which only the last one is disposed below.
	String savepointPath = null;
	for (int i = 0; savepointPath == null && i < 20; i++) {
		LOG.info("Triggering savepoint (" + (i + 1) + "/20).");
		try {
			savepointPath = clusterClient.triggerSavepoint(jobId, null)
				.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
		} catch (Exception cause) {
			LOG.info("Failed to trigger savepoint. Retrying...", cause);
			// This can fail if the operators are not opened yet
			Thread.sleep(500);
		}
	}

	assertNotNull("Failed to trigger savepoint", savepointPath);

	clusterClient.disposeSavepoint(savepointPath).get();

	clusterClient.cancel(jobId);

	// make sure, the execution is finished to not influence other test methods
	invokeThread.join(deadline.timeLeft().toMillis());
	assertFalse("Program invoke thread still running", invokeThread.isAlive());
}

Source File: RescalingITCase.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 *
 * @throws Exception if a cluster interaction fails, or if the rescaled submission fails with
 *     anything other than a {@code JobExecutionException} caused by an
 *     {@code IllegalStateException}
 */
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
	final int parallelism = numSlots / 2;
	final int parallelism2 = numSlots;
	final int maxParallelism = 13;

	Duration timeout = Duration.ofMinutes(3);
	Deadline deadline = Deadline.now().plus(timeout);

	ClusterClient<?> client = cluster.getClusterClient();

	try {
		JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		final JobID jobID = jobGraph.getJobID();

		client.setDetached(true);
		client.submitJob(jobGraph, RescalingITCase.class.getClassLoader());

		// wait until the operator is started
		StateSourceBase.workStartedLatch.await();

		CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null);

		final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		client.cancel(jobID);

		// Wait for the cancelled job to disappear, bounded by the test deadline so that a
		// cancellation that never completes fails the test instead of hanging it forever.
		while (!getRunningJobs(client).isEmpty()) {
			if (!deadline.hasTimeLeft()) {
				throw new AssertionError("Job " + jobID + " was still running when the test deadline expired.");
			}
			Thread.sleep(50);
		}

		// job successfully removed
		JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

		// Submit attached so that the failure of the rescaled job surfaces as an exception here.
		client.setDetached(false);
		client.submitJob(scaledJobGraph, RescalingITCase.class.getClassLoader());
	} catch (JobExecutionException exception) {
		if (exception.getCause() instanceof IllegalStateException) {
			// we expect a IllegalStateException wrapped
			// in a JobExecutionException, because the job containing non-partitioned state
			// is being rescaled
		} else {
			throw exception;
		}
	}
}

Source File: PulsarTestBaseWithFlink.java From pulsar-flink with Apache License 2.0

4 votes

/**
 * Cancels every job returned by {@code getRunningJobs(client)}, one after the other.
 *
 * @param client client used both to look up the running jobs and to cancel them
 * @throws Exception if listing the jobs or any cancellation fails
 */
public static void cancelRunningJobs(ClusterClient<?> client) throws Exception {
    for (JobID jobId : getRunningJobs(client)) {
        client.cancel(jobId);
    }
}

Source File: ClassLoaderITCase.java From flink with Apache License 2.0

4 votes

/**
 * Tests disposal of a savepoint, which contains custom user code KvState.
 *
 * <p>The program is started on a separate thread, the test waits for a job in state RUNNING,
 * triggers a savepoint (retrying up to 20 times), disposes that savepoint and cancels the job.
 *
 * @throws Exception if any cluster interaction fails or the 100 second deadline is exceeded
 */
@Test
public void testDisposeSavepointWithCustomKvState() throws Exception {
	ClusterClient<?> clusterClient = new MiniClusterClient(new Configuration(), miniClusterResource.getMiniCluster());

	Deadline deadline = new FiniteDuration(100, TimeUnit.SECONDS).fromNow();

	File checkpointDir = FOLDER.newFolder();
	File outputDir = FOLDER.newFolder();

	final PackagedProgram program = new PackagedProgram(
			new File(CUSTOM_KV_STATE_JAR_PATH),
			new String[] {
					String.valueOf(parallelism),
					checkpointDir.toURI().toString(),
					"5000",
					outputDir.toURI().toString()
			});

	TestStreamEnvironment.setAsContext(
		miniClusterResource.getMiniCluster(),
		parallelism,
		Collections.singleton(new Path(CUSTOM_KV_STATE_JAR_PATH)),
		Collections.<URL>emptyList()
	);

	// Execute detached
	Thread invokeThread = new Thread(new Runnable() {
		@Override
		public void run() {
			try {
				program.invokeInteractiveModeForExecution();
			} catch (ProgramInvocationException e) {
				// A cancellation is the expected way for the program to end; report anything else.
				if (e.getCause() == null ||
					!(e.getCause() instanceof JobCancellationException)) {
					e.printStackTrace();
				}
			}
		}
	});

	LOG.info("Starting program invoke thread");
	invokeThread.start();

	// The job ID
	JobID jobId = null;

	LOG.info("Waiting for job status running.");

	// Wait for running job
	while (jobId == null && deadline.hasTimeLeft()) {

		Collection<JobStatusMessage> jobs = clusterClient.listJobs().get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
		for (JobStatusMessage job : jobs) {
			if (job.getJobState() == JobStatus.RUNNING) {
				jobId = job.getJobId();
				LOG.info("Job running. ID: " + jobId);
				break;
			}
		}

		// Retry if job is not available yet
		if (jobId == null) {
			Thread.sleep(100L);
		}
	}

	// Fail right away with a precise message instead of passing a null job ID to the client.
	assertNotNull("No running job found within the deadline", jobId);

	// Trigger savepoint. Stop at the first success: continuing would create further savepoints
	// of which only the last one is disposed below.
	String savepointPath = null;
	for (int i = 0; savepointPath == null && i < 20; i++) {
		LOG.info("Triggering savepoint (" + (i + 1) + "/20).");
		try {
			savepointPath = clusterClient.triggerSavepoint(jobId, null)
				.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
		} catch (Exception cause) {
			LOG.info("Failed to trigger savepoint. Retrying...", cause);
			// This can fail if the operators are not opened yet
			Thread.sleep(500);
		}
	}

	assertNotNull("Failed to trigger savepoint", savepointPath);

	clusterClient.disposeSavepoint(savepointPath).get();

	clusterClient.cancel(jobId);

	// make sure, the execution is finished to not influence other test methods
	invokeThread.join(deadline.timeLeft().toMillis());
	assertFalse("Program invoke thread still running", invokeThread.isAlive());
}

Source File: RescalingITCase.java From flink with Apache License 2.0

4 votes

/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 *
 * @throws Exception if a cluster interaction fails, or if the rescaled submission fails with
 *     anything other than a {@code JobExecutionException} caused by an
 *     {@code IllegalStateException}
 */
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
	final int parallelism = numSlots / 2;
	final int parallelism2 = numSlots;
	final int maxParallelism = 13;

	Duration timeout = Duration.ofMinutes(3);
	Deadline deadline = Deadline.now().plus(timeout);

	ClusterClient<?> client = cluster.getClusterClient();

	try {
		JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		final JobID jobID = jobGraph.getJobID();

		client.setDetached(true);
		client.submitJob(jobGraph, RescalingITCase.class.getClassLoader());

		// wait until the operator is started
		StateSourceBase.workStartedLatch.await();

		CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null);

		final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		client.cancel(jobID);

		// Wait for the cancelled job to disappear, bounded by the test deadline so that a
		// cancellation that never completes fails the test instead of hanging it forever.
		while (!getRunningJobs(client).isEmpty()) {
			if (!deadline.hasTimeLeft()) {
				throw new AssertionError("Job " + jobID + " was still running when the test deadline expired.");
			}
			Thread.sleep(50);
		}

		// job successfully removed
		JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

		// Submit attached so that the failure of the rescaled job surfaces as an exception here.
		client.setDetached(false);
		client.submitJob(scaledJobGraph, RescalingITCase.class.getClassLoader());
	} catch (JobExecutionException exception) {
		if (exception.getCause() instanceof IllegalStateException) {
			// we expect a IllegalStateException wrapped
			// in a JobExecutionException, because the job containing non-partitioned state
			// is being rescaled
		} else {
			throw exception;
		}
	}
}

Source File: SavepointReaderITTestBase.java From flink with Apache License 2.0

4 votes

/**
 * Submits {@code jobGraph} in detached mode, waits until {@code SavepointSource} reports that
 * it is finished, triggers a savepoint into a fresh temporary directory and returns its path.
 * The job is cancelled in all cases before this method returns.
 *
 * @param jobGraph job to run and take the savepoint of
 * @return the path returned by the savepoint future
 * @throws Exception if submission or the savepoint fails, the five minute deadline expires, or
 *     the waiting thread is interrupted
 */
private String takeSavepoint(JobGraph jobGraph) throws Exception {
	SavepointSource.initializeForTest();

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.setDetached(true);

	JobID jobId = jobGraph.getJobID();

	Deadline deadline = Deadline.fromNow(Duration.ofMinutes(5));

	String dirPath = getTempDirPath(new AbstractID().toHexString());

	try {
		JobSubmissionResult result = client.submitJob(jobGraph, SavepointReaderITCase.class.getClassLoader());

		boolean finished = false;
		while (deadline.hasTimeLeft()) {
			if (SavepointSource.isFinished()) {
				finished = true;

				break;
			}

			// Let an interrupt propagate. Restoring the flag and looping on would make every
			// later sleep throw immediately, turning this wait into a busy spin until the
			// deadline expires.
			Thread.sleep(2L);
		}

		if (!finished) {
			Assert.fail("Failed to initialize state within deadline");
		}

		CompletableFuture<String> path = client.triggerSavepoint(result.getJobID(), dirPath);
		return path.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
	} finally {
		// Always stop the job, whether or not the savepoint succeeded.
		client.cancel(jobId);
	}
}

Source File: CancelJob.java From bravo with Apache License 2.0

4 votes

/**
 * Cancels the job identified by {@code id} using the given cluster client.
 *
 * @param client client the cancellation is issued through
 * @param id ID of the job to cancel
 * @throws Exception if {@code client.cancel(id)} fails
 */
@Override
public void onceExecuteClusterAction(ClusterClient<?> client, JobID id) throws Exception {
	client.cancel(id);
}

Java Code Examples for org.apache.flink.client.program.ClusterClient#cancel()