org.apache.flink.client.program.ClusterClient#setDetached

Source File: AbstractOperatorRestoreTestBase.java From Flink-CEPplus with Apache License 2.0

5 votes

@Test
public void testMigrationAndRestore() throws Throwable {
	ClassLoader classLoader = this.getClass().getClassLoader();
	ClusterClient<?> clusterClient = MINI_CLUSTER_RESOURCE.getClusterClient();
	clusterClient.setDetached(true);
	final Deadline deadline = Deadline.now().plus(TEST_TIMEOUT);

	// submit job with old version savepoint and create a migrated savepoint in the new version
	String savepointPath = migrateJob(classLoader, clusterClient, deadline);
	// restore from migrated new version savepoint
	restoreJob(classLoader, clusterClient, deadline, savepointPath);
}

Source File: AbstractOperatorRestoreTestBase.java From flink with Apache License 2.0

5 votes

@Test
public void testMigrationAndRestore() throws Throwable {
	ClassLoader classLoader = this.getClass().getClassLoader();
	ClusterClient<?> clusterClient = cluster.getClusterClient();
	clusterClient.setDetached(true);
	final Deadline deadline = Deadline.now().plus(TEST_TIMEOUT);

	// submit job with old version savepoint and create a migrated savepoint in the new version
	String savepointPath = migrateJob(classLoader, clusterClient, deadline);
	// restore from migrated new version savepoint
	restoreJob(classLoader, clusterClient, deadline, savepointPath);
}

Source File: CancelingTestBase.java From flink with Apache License 2.0

5 votes

protected void runAndCancelJob(Plan plan, final int msecsTillCanceling, int maxTimeTillCanceled) throws Exception {
	// submit job
	final JobGraph jobGraph = getJobGraph(plan);

	ClusterClient<?> client = CLUSTER.getClusterClient();
	client.setDetached(true);

	JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, CancelingTestBase.class.getClassLoader());

	Deadline submissionDeadLine = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();

	JobStatus jobStatus = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
	while (jobStatus != JobStatus.RUNNING && submissionDeadLine.hasTimeLeft()) {
		Thread.sleep(50);
		jobStatus = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
	}
	if (jobStatus != JobStatus.RUNNING) {
		Assert.fail("Job not in state RUNNING.");
	}

	Thread.sleep(msecsTillCanceling);

	client.cancel(jobSubmissionResult.getJobID());

	Deadline cancelDeadline = new FiniteDuration(maxTimeTillCanceled, TimeUnit.MILLISECONDS).fromNow();

	JobStatus jobStatusAfterCancel = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
	while (jobStatusAfterCancel != JobStatus.CANCELED && cancelDeadline.hasTimeLeft()) {
		Thread.sleep(50);
		jobStatusAfterCancel = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
	}
	if (jobStatusAfterCancel != JobStatus.CANCELED) {
		Assert.fail("Failed to cancel job with ID " + jobSubmissionResult.getJobID() + '.');
	}
}

Source File: SavepointReaderKeyedStateITCase.java From flink with Apache License 2.0

5 votes

private String takeSavepoint(JobGraph jobGraph) throws Exception {
	SavepointSource.initializeForTest();

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.setDetached(true);

	JobID jobId = jobGraph.getJobID();

	Deadline deadline = Deadline.fromNow(Duration.ofMinutes(5));

	String dirPath = getTempDirPath(new AbstractID().toHexString());

	try {
		client.setDetached(true);
		JobSubmissionResult result = client.submitJob(jobGraph, getClass().getClassLoader());

		boolean finished = false;
		while (deadline.hasTimeLeft()) {
			if (SavepointSource.isFinished()) {
				finished = true;

				break;
			}
		}

		if (!finished) {
			Assert.fail("Failed to initialize state within deadline");
		}

		CompletableFuture<String> path = client.triggerSavepoint(result.getJobID(), dirPath);
		return path.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
	} finally {
		client.cancel(jobId);
	}
}

Source File: SavepointITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

@Test
public void testTriggerSavepointWithCheckpointingDisabled() throws Exception {
	// Config
	final int numTaskManagers = 1;
	final int numSlotsPerTaskManager = 1;

	final Configuration config = new Configuration();

	final MiniClusterWithClientResource cluster = new MiniClusterWithClientResource(
		new MiniClusterResourceConfiguration.Builder()
			.setConfiguration(config)
			.setNumberTaskManagers(numTaskManagers)
			.setNumberSlotsPerTaskManager(numSlotsPerTaskManager)
			.build());
	cluster.before();
	final ClusterClient<?> client = cluster.getClusterClient();

	final JobVertex vertex = new JobVertex("Blocking vertex");
	vertex.setInvokableClass(BlockingNoOpInvokable.class);
	vertex.setParallelism(1);

	final JobGraph graph = new JobGraph(vertex);

	try {
		client.setDetached(true);
		client.submitJob(graph, SavepointITCase.class.getClassLoader());

		client.triggerSavepoint(graph.getJobID(), null).get();

		fail();
	} catch (ExecutionException e) {
		assertTrue(ExceptionUtils.findThrowable(e, IllegalStateException.class).isPresent());
		assertTrue(ExceptionUtils.findThrowableWithMessage(e, graph.getJobID().toString()).isPresent());
		assertTrue(ExceptionUtils.findThrowableWithMessage(e, "is not a streaming job").isPresent());
	} finally {
		cluster.after();
	}
}

Source File: SavepointITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
	final JobID jobId = jobGraph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	MiniClusterWithClientResource cluster = clusterFactory.get();
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {
		client.setDetached(true);
		client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

		// Await state is restored
		StatefulCounter.getRestoreLatch().await();

		// Await some progress after restore
		StatefulCounter.getProgressLatch().await();

		client.cancel(jobId);

		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobId),
			Time.milliseconds(50),
			Deadline.now().plus(Duration.ofSeconds(30)),
			status -> status == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor()
		);

		client.disposeSavepoint(savepointPath)
			.get();

		assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
	} finally {
		cluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}

Source File: SavepointITCase.java From flink with Apache License 2.0

5 votes

@Test
public void testTriggerSavepointWithCheckpointingDisabled() throws Exception {
	// Config
	final int numTaskManagers = 1;
	final int numSlotsPerTaskManager = 1;

	final Configuration config = new Configuration();

	final MiniClusterWithClientResource cluster = new MiniClusterWithClientResource(
		new MiniClusterResourceConfiguration.Builder()
			.setConfiguration(config)
			.setNumberTaskManagers(numTaskManagers)
			.setNumberSlotsPerTaskManager(numSlotsPerTaskManager)
			.build());
	cluster.before();
	final ClusterClient<?> client = cluster.getClusterClient();

	final JobVertex vertex = new JobVertex("Blocking vertex");
	vertex.setInvokableClass(BlockingNoOpInvokable.class);
	vertex.setParallelism(1);

	final JobGraph graph = new JobGraph(vertex);

	try {
		client.setDetached(true);
		client.submitJob(graph, SavepointITCase.class.getClassLoader());

		client.triggerSavepoint(graph.getJobID(), null).get();

		fail();
	} catch (ExecutionException e) {
		assertTrue(ExceptionUtils.findThrowable(e, IllegalStateException.class).isPresent());
		assertTrue(ExceptionUtils.findThrowableWithMessage(e, graph.getJobID().toString()).isPresent());
		assertTrue(ExceptionUtils.findThrowableWithMessage(e, "is not a streaming job").isPresent());
	} finally {
		cluster.after();
	}
}

Source File: CancelingTestBase.java From Flink-CEPplus with Apache License 2.0

5 votes

protected void runAndCancelJob(Plan plan, final int msecsTillCanceling, int maxTimeTillCanceled) throws Exception {
	// submit job
	final JobGraph jobGraph = getJobGraph(plan);

	ClusterClient<?> client = CLUSTER.getClusterClient();
	client.setDetached(true);

	JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, CancelingTestBase.class.getClassLoader());

	Deadline submissionDeadLine = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();

	JobStatus jobStatus = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
	while (jobStatus != JobStatus.RUNNING && submissionDeadLine.hasTimeLeft()) {
		Thread.sleep(50);
		jobStatus = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
	}
	if (jobStatus != JobStatus.RUNNING) {
		Assert.fail("Job not in state RUNNING.");
	}

	Thread.sleep(msecsTillCanceling);

	client.cancel(jobSubmissionResult.getJobID());

	Deadline cancelDeadline = new FiniteDuration(maxTimeTillCanceled, TimeUnit.MILLISECONDS).fromNow();

	JobStatus jobStatusAfterCancel = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
	while (jobStatusAfterCancel != JobStatus.CANCELED && cancelDeadline.hasTimeLeft()) {
		Thread.sleep(50);
		jobStatusAfterCancel = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
	}
	if (jobStatusAfterCancel != JobStatus.CANCELED) {
		Assert.fail("Failed to cancel job with ID " + jobSubmissionResult.getJobID() + '.');
	}
}

Source File: WebFrontendITCase.java From flink with Apache License 2.0

4 votes

@Test
public void testCancelYarn() throws Exception {
	// this only works if there is no active job at this point
	assertTrue(getRunningJobs(CLUSTER.getClusterClient()).isEmpty());

	// Create a task
	final JobVertex sender = new JobVertex("Sender");
	sender.setParallelism(2);
	sender.setInvokableClass(BlockingInvokable.class);

	final JobGraph jobGraph = new JobGraph("Stoppable streaming test job", sender);
	final JobID jid = jobGraph.getJobID();

	ClusterClient<?> clusterClient = CLUSTER.getClusterClient();
	clusterClient.setDetached(true);
	clusterClient.submitJob(jobGraph, WebFrontendITCase.class.getClassLoader());

	// wait for job to show up
	while (getRunningJobs(CLUSTER.getClusterClient()).isEmpty()) {
		Thread.sleep(10);
	}

	// wait for tasks to be properly running
	BlockingInvokable.latch.await();

	final FiniteDuration testTimeout = new FiniteDuration(2, TimeUnit.MINUTES);
	final Deadline deadline = testTimeout.fromNow();

	try (HttpTestClient client = new HttpTestClient("localhost", getRestPort())) {
		// Request the file from the web server
		client.sendGetRequest("/jobs/" + jid + "/yarn-cancel", deadline.timeLeft());

		HttpTestClient.SimpleHttpResponse response = client
			.getNextResponse(deadline.timeLeft());

		assertEquals(HttpResponseStatus.ACCEPTED, response.getStatus());
		assertEquals("application/json; charset=UTF-8", response.getType());
		assertEquals("{}", response.getContent());
	}

	// wait for cancellation to finish
	while (!getRunningJobs(CLUSTER.getClusterClient()).isEmpty()) {
		Thread.sleep(20);
	}

	BlockingInvokable.reset();
}

Source File: SavepointMigrationTestBase.java From Flink-CEPplus with Apache License 2.0

4 votes

@SafeVarargs
protected final void restoreAndExecute(
		StreamExecutionEnvironment env,
		String savepointPath,
		Tuple2<String, Integer>... expectedAccumulators) throws Exception {

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.setDetached(true);

	// Submit the job
	JobGraph jobGraph = env.getStreamGraph().getJobGraph();

	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

	JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, SavepointMigrationTestBase.class.getClassLoader());

	boolean done = false;
	while (DEADLINE.hasTimeLeft()) {

		// try and get a job result, this will fail if the job already failed. Use this
		// to get out of this loop
		JobID jobId = jobSubmissionResult.getJobID();

		try {
			CompletableFuture<JobStatus> jobStatusFuture = client.getJobStatus(jobSubmissionResult.getJobID());

			JobStatus jobStatus = jobStatusFuture.get(5, TimeUnit.SECONDS);

			assertNotEquals(JobStatus.FAILED, jobStatus);
		} catch (Exception e) {
			fail("Could not connect to job: " + e);
		}

		Thread.sleep(100);
		Map<String, OptionalFailure<Object>> accumulators = client.getAccumulators(jobId);

		boolean allDone = true;
		for (Tuple2<String, Integer> acc : expectedAccumulators) {
			OptionalFailure<Object> numFinished = accumulators.get(acc.f0);
			if (numFinished == null) {
				allDone = false;
				break;
			}
			if (!numFinished.get().equals(acc.f1)) {
				allDone = false;
				break;
			}
		}

		if (allDone) {
			done = true;
			break;
		}
	}

	if (!done) {
		fail("Did not see the expected accumulator results within time limit.");
	}
}

Source File: SavepointITCase.java From Flink-CEPplus with Apache License 2.0

4 votes

@Test
public void testSubmitWithUnknownSavepointPath() throws Exception {
	// Config
	int numTaskManagers = 1;
	int numSlotsPerTaskManager = 1;
	int parallelism = numTaskManagers * numSlotsPerTaskManager;

	final Configuration config = new Configuration();
	config.setString(CheckpointingOptions.SAVEPOINT_DIRECTORY, savepointDir.toURI().toString());

	MiniClusterWithClientResource cluster = new MiniClusterWithClientResource(
		new MiniClusterResourceConfiguration.Builder()
			.setConfiguration(config)
			.setNumberTaskManagers(numTaskManagers)
			.setNumberSlotsPerTaskManager(numSlotsPerTaskManager)
			.build());
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {

		// High value to ensure timeouts if restarted.
		int numberOfRetries = 1000;
		// Submit the job
		// Long delay to ensure that the test times out if the job
		// manager tries to restart the job.
		final JobGraph jobGraph = createJobGraph(parallelism, numberOfRetries, 3600000);

		// Set non-existing savepoint path
		jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath("unknown path"));
		assertEquals("unknown path", jobGraph.getSavepointRestoreSettings().getRestorePath());

		LOG.info("Submitting job " + jobGraph.getJobID() + " in detached mode.");

		try {
			client.setDetached(false);
			client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());
		} catch (Exception e) {
			Optional<JobExecutionException> expectedJobExecutionException = ExceptionUtils.findThrowable(e, JobExecutionException.class);
			Optional<FileNotFoundException> expectedFileNotFoundException = ExceptionUtils.findThrowable(e, FileNotFoundException.class);
			if (!(expectedJobExecutionException.isPresent() && expectedFileNotFoundException.isPresent())) {
				throw e;
			}
		}
	} finally {
		cluster.after();
	}
}

Source File: ResumeCheckpointManuallyITCase.java From Flink-CEPplus with Apache License 2.0

4 votes

private void testExternalizedCheckpoints(
	File checkpointDir,
	String zooKeeperQuorum,
	StateBackend backend,
	boolean localRecovery) throws Exception {

	final Configuration config = new Configuration();

	final File savepointDir = temporaryFolder.newFolder();

	config.setString(CheckpointingOptions.CHECKPOINTS_DIRECTORY, checkpointDir.toURI().toString());
	config.setString(CheckpointingOptions.SAVEPOINT_DIRECTORY, savepointDir.toURI().toString());
	config.setBoolean(CheckpointingOptions.LOCAL_RECOVERY, localRecovery);

	// ZooKeeper recovery mode?
	if (zooKeeperQuorum != null) {
		final File haDir = temporaryFolder.newFolder();
		config.setString(HighAvailabilityOptions.HA_MODE, "ZOOKEEPER");
		config.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, zooKeeperQuorum);
		config.setString(HighAvailabilityOptions.HA_STORAGE_PATH, haDir.toURI().toString());
	}

	MiniClusterWithClientResource cluster = new MiniClusterWithClientResource(
		new MiniClusterResourceConfiguration.Builder()
			.setConfiguration(config)
			.setNumberTaskManagers(NUM_TASK_MANAGERS)
			.setNumberSlotsPerTaskManager(SLOTS_PER_TASK_MANAGER)
			.build());

	cluster.before();

	ClusterClient<?> client = cluster.getClusterClient();
	client.setDetached(true);

	try {
		// main test sequence:  start job -> eCP -> restore job -> eCP -> restore job
		String firstExternalCheckpoint = runJobAndGetExternalizedCheckpoint(backend, checkpointDir, null, client);
		assertNotNull(firstExternalCheckpoint);

		String secondExternalCheckpoint = runJobAndGetExternalizedCheckpoint(backend, checkpointDir, firstExternalCheckpoint, client);
		assertNotNull(secondExternalCheckpoint);

		String thirdExternalCheckpoint = runJobAndGetExternalizedCheckpoint(backend, checkpointDir, secondExternalCheckpoint, client);
		assertNotNull(thirdExternalCheckpoint);
	} finally {
		cluster.after();
	}
}

Source File: SavepointMigrationTestBase.java From Flink-CEPplus with Apache License 2.0

4 votes

@SafeVarargs
protected final void executeAndSavepoint(
		StreamExecutionEnvironment env,
		String savepointPath,
		Tuple2<String, Integer>... expectedAccumulators) throws Exception {

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.setDetached(true);

	// Submit the job
	JobGraph jobGraph = env.getStreamGraph().getJobGraph();

	JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, SavepointMigrationTestBase.class.getClassLoader());

	LOG.info("Submitted job {} and waiting...", jobSubmissionResult.getJobID());

	boolean done = false;
	while (DEADLINE.hasTimeLeft()) {
		Thread.sleep(100);
		Map<String, OptionalFailure<Object>> accumulators = client.getAccumulators(jobSubmissionResult.getJobID());

		boolean allDone = true;
		for (Tuple2<String, Integer> acc : expectedAccumulators) {
			OptionalFailure<Object> accumOpt = accumulators.get(acc.f0);
			if (accumOpt == null) {
				allDone = false;
				break;
			}

			Integer numFinished = (Integer) accumOpt.get();
			if (numFinished == null) {
				allDone = false;
				break;
			}
			if (!numFinished.equals(acc.f1)) {
				allDone = false;
				break;
			}
		}
		if (allDone) {
			done = true;
			break;
		}
	}

	if (!done) {
		fail("Did not see the expected accumulator results within time limit.");
	}

	LOG.info("Triggering savepoint.");

	CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobSubmissionResult.getJobID(), null);

	String jobmanagerSavepointPath = savepointPathFuture.get(DEADLINE.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

	File jobManagerSavepoint = new File(new URI(jobmanagerSavepointPath).getPath());
	// savepoints were changed to be directories in Flink 1.3
	if (jobManagerSavepoint.isDirectory()) {
		FileUtils.moveDirectory(jobManagerSavepoint, new File(savepointPath));
	} else {
		FileUtils.moveFile(jobManagerSavepoint, new File(savepointPath));
	}
}

Source File: JMXJobManagerMetricTest.java From flink with Apache License 2.0

4 votes

/**
 * Tests that metrics registered on the JobManager are actually accessible via JMX.
 */
@Test
public void testJobManagerJMXMetricAccess() throws Exception {
	Deadline deadline = Deadline.now().plus(Duration.ofMinutes(2));

	try {
		JobVertex sourceJobVertex = new JobVertex("Source");
		sourceJobVertex.setInvokableClass(BlockingInvokable.class);

		JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
		jobGraph.setSnapshotSettings(new JobCheckpointingSettings(
			Collections.<JobVertexID>emptyList(),
			Collections.<JobVertexID>emptyList(),
			Collections.<JobVertexID>emptyList(),
			new CheckpointCoordinatorConfiguration(
				500,
				500,
				50,
				5,
				CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
				true,
				false,
				0),
			null));

		ClusterClient<?> client = MINI_CLUSTER_RESOURCE.getClusterClient();
		client.setDetached(true);
		client.submitJob(jobGraph, JMXJobManagerMetricTest.class.getClassLoader());

		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobGraph.getJobID()),
			Time.milliseconds(10),
			deadline,
			status -> status == JobStatus.RUNNING,
			TestingUtils.defaultScheduledExecutor()
		).get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		MBeanServer mBeanServer = ManagementFactory.getPlatformMBeanServer();
		Set<ObjectName> nameSet = mBeanServer.queryNames(new ObjectName("org.apache.flink.jobmanager.job.lastCheckpointSize:job_name=TestingJob,*"), null);
		Assert.assertEquals(1, nameSet.size());
		assertEquals(-1L, mBeanServer.getAttribute(nameSet.iterator().next(), "Value"));

		BlockingInvokable.unblock();
	} finally {
		BlockingInvokable.unblock();
	}
}

Source File: JMXJobManagerMetricTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Tests that metrics registered on the JobManager are actually accessible via JMX.
 */
@Test
public void testJobManagerJMXMetricAccess() throws Exception {
	Deadline deadline = Deadline.now().plus(Duration.ofMinutes(2));

	try {
		JobVertex sourceJobVertex = new JobVertex("Source");
		sourceJobVertex.setInvokableClass(BlockingInvokable.class);

		JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
		jobGraph.setSnapshotSettings(new JobCheckpointingSettings(
			Collections.<JobVertexID>emptyList(),
			Collections.<JobVertexID>emptyList(),
			Collections.<JobVertexID>emptyList(),
			new CheckpointCoordinatorConfiguration(
				500,
				500,
				50,
				5,
				CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
				true),
			null));

		ClusterClient<?> client = MINI_CLUSTER_RESOURCE.getClusterClient();
		client.setDetached(true);
		client.submitJob(jobGraph, JMXJobManagerMetricTest.class.getClassLoader());

		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobGraph.getJobID()),
			Time.milliseconds(10),
			deadline,
			status -> status == JobStatus.RUNNING,
			TestingUtils.defaultScheduledExecutor()
		).get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		MBeanServer mBeanServer = ManagementFactory.getPlatformMBeanServer();
		Set<ObjectName> nameSet = mBeanServer.queryNames(new ObjectName("org.apache.flink.jobmanager.job.lastCheckpointSize:job_name=TestingJob,*"), null);
		Assert.assertEquals(1, nameSet.size());
		assertEquals(-1L, mBeanServer.getAttribute(nameSet.iterator().next(), "Value"));

		BlockingInvokable.unblock();
	} finally {
		BlockingInvokable.unblock();
	}
}

Source File: RescalingITCase.java From flink with Apache License 2.0

4 votes

/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 *
 * @throws Exception
 */
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
	final int parallelism = numSlots / 2;
	final int parallelism2 = numSlots;
	final int maxParallelism = 13;

	Duration timeout = Duration.ofMinutes(3);
	Deadline deadline = Deadline.now().plus(timeout);

	ClusterClient<?> client = cluster.getClusterClient();

	try {
		JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		final JobID jobID = jobGraph.getJobID();

		client.setDetached(true);
		client.submitJob(jobGraph, RescalingITCase.class.getClassLoader());

		// wait until the operator is started
		StateSourceBase.workStartedLatch.await();

		CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null);

		final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		client.cancel(jobID);

		while (!getRunningJobs(client).isEmpty()) {
			Thread.sleep(50);
		}

		// job successfully removed
		JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

		client.setDetached(false);
		client.submitJob(scaledJobGraph, RescalingITCase.class.getClassLoader());
	} catch (JobExecutionException exception) {
		if (exception.getCause() instanceof IllegalStateException) {
			// we expect a IllegalStateException wrapped
			// in a JobExecutionException, because the job containing non-partitioned state
			// is being rescaled
		} else {
			throw exception;
		}
	}
}

Source File: SavepointMigrationTestBase.java From flink with Apache License 2.0

4 votes

@SafeVarargs
protected final void executeAndSavepoint(
		StreamExecutionEnvironment env,
		String savepointPath,
		Tuple2<String, Integer>... expectedAccumulators) throws Exception {

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.setDetached(true);

	// Submit the job
	JobGraph jobGraph = env.getStreamGraph().getJobGraph();

	JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, SavepointMigrationTestBase.class.getClassLoader());

	LOG.info("Submitted job {} and waiting...", jobSubmissionResult.getJobID());

	boolean done = false;
	while (DEADLINE.hasTimeLeft()) {
		Thread.sleep(100);
		Map<String, OptionalFailure<Object>> accumulators = client.getAccumulators(jobSubmissionResult.getJobID());

		boolean allDone = true;
		for (Tuple2<String, Integer> acc : expectedAccumulators) {
			OptionalFailure<Object> accumOpt = accumulators.get(acc.f0);
			if (accumOpt == null) {
				allDone = false;
				break;
			}

			Integer numFinished = (Integer) accumOpt.get();
			if (numFinished == null) {
				allDone = false;
				break;
			}
			if (!numFinished.equals(acc.f1)) {
				allDone = false;
				break;
			}
		}
		if (allDone) {
			done = true;
			break;
		}
	}

	if (!done) {
		fail("Did not see the expected accumulator results within time limit.");
	}

	LOG.info("Triggering savepoint.");

	CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobSubmissionResult.getJobID(), null);

	String jobmanagerSavepointPath = savepointPathFuture.get(DEADLINE.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

	File jobManagerSavepoint = new File(new URI(jobmanagerSavepointPath).getPath());
	// savepoints were changed to be directories in Flink 1.3
	if (jobManagerSavepoint.isDirectory()) {
		FileUtils.moveDirectory(jobManagerSavepoint, new File(savepointPath));
	} else {
		FileUtils.moveFile(jobManagerSavepoint, new File(savepointPath));
	}
}

Source File: SavepointMigrationTestBase.java From flink with Apache License 2.0

4 votes

@SafeVarargs
protected final void restoreAndExecute(
		StreamExecutionEnvironment env,
		String savepointPath,
		Tuple2<String, Integer>... expectedAccumulators) throws Exception {

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.setDetached(true);

	// Submit the job
	JobGraph jobGraph = env.getStreamGraph().getJobGraph();

	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

	JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, SavepointMigrationTestBase.class.getClassLoader());

	boolean done = false;
	while (DEADLINE.hasTimeLeft()) {

		// try and get a job result, this will fail if the job already failed. Use this
		// to get out of this loop
		JobID jobId = jobSubmissionResult.getJobID();

		try {
			CompletableFuture<JobStatus> jobStatusFuture = client.getJobStatus(jobSubmissionResult.getJobID());

			JobStatus jobStatus = jobStatusFuture.get(5, TimeUnit.SECONDS);

			assertNotEquals(JobStatus.FAILED, jobStatus);
		} catch (Exception e) {
			fail("Could not connect to job: " + e);
		}

		Thread.sleep(100);
		Map<String, OptionalFailure<Object>> accumulators = client.getAccumulators(jobId);

		boolean allDone = true;
		for (Tuple2<String, Integer> acc : expectedAccumulators) {
			OptionalFailure<Object> numFinished = accumulators.get(acc.f0);
			if (numFinished == null) {
				allDone = false;
				break;
			}
			if (!numFinished.get().equals(acc.f1)) {
				allDone = false;
				break;
			}
		}

		if (allDone) {
			done = true;
			break;
		}
	}

	if (!done) {
		fail("Did not see the expected accumulator results within time limit.");
	}
}

Source File: SavepointReaderITTestBase.java From flink with Apache License 2.0

4 votes

private String takeSavepoint(JobGraph jobGraph) throws Exception {
	SavepointSource.initializeForTest();

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.setDetached(true);

	JobID jobId = jobGraph.getJobID();

	Deadline deadline = Deadline.fromNow(Duration.ofMinutes(5));

	String dirPath = getTempDirPath(new AbstractID().toHexString());

	try {
		client.setDetached(true);
		JobSubmissionResult result = client.submitJob(jobGraph, SavepointReaderITCase.class.getClassLoader());

		boolean finished = false;
		while (deadline.hasTimeLeft()) {
			if (SavepointSource.isFinished()) {
				finished = true;

				break;
			}

			try {
				Thread.sleep(2L);
			} catch (InterruptedException ignored) {
				Thread.currentThread().interrupt();
			}
		}

		if (!finished) {
			Assert.fail("Failed to initialize state within deadline");
		}

		CompletableFuture<String> path = client.triggerSavepoint(result.getJobID(), dirPath);
		return path.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
	} finally {
		client.cancel(jobId);
	}
}

Source File: ResumeCheckpointManuallyITCase.java From flink with Apache License 2.0

4 votes

private void testExternalizedCheckpoints(
	File checkpointDir,
	String zooKeeperQuorum,
	StateBackend backend,
	boolean localRecovery) throws Exception {

	final Configuration config = new Configuration();

	final File savepointDir = temporaryFolder.newFolder();

	config.setString(CheckpointingOptions.CHECKPOINTS_DIRECTORY, checkpointDir.toURI().toString());
	config.setString(CheckpointingOptions.SAVEPOINT_DIRECTORY, savepointDir.toURI().toString());
	config.setBoolean(CheckpointingOptions.LOCAL_RECOVERY, localRecovery);

	// ZooKeeper recovery mode?
	if (zooKeeperQuorum != null) {
		final File haDir = temporaryFolder.newFolder();
		config.setString(HighAvailabilityOptions.HA_MODE, "ZOOKEEPER");
		config.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, zooKeeperQuorum);
		config.setString(HighAvailabilityOptions.HA_STORAGE_PATH, haDir.toURI().toString());
	}

	MiniClusterWithClientResource cluster = new MiniClusterWithClientResource(
		new MiniClusterResourceConfiguration.Builder()
			.setConfiguration(config)
			.setNumberTaskManagers(NUM_TASK_MANAGERS)
			.setNumberSlotsPerTaskManager(SLOTS_PER_TASK_MANAGER)
			.build());

	cluster.before();

	ClusterClient<?> client = cluster.getClusterClient();
	client.setDetached(true);

	try {
		// main test sequence:  start job -> eCP -> restore job -> eCP -> restore job
		String firstExternalCheckpoint = runJobAndGetExternalizedCheckpoint(backend, checkpointDir, null, client);
		assertNotNull(firstExternalCheckpoint);

		String secondExternalCheckpoint = runJobAndGetExternalizedCheckpoint(backend, checkpointDir, firstExternalCheckpoint, client);
		assertNotNull(secondExternalCheckpoint);

		String thirdExternalCheckpoint = runJobAndGetExternalizedCheckpoint(backend, checkpointDir, secondExternalCheckpoint, client);
		assertNotNull(thirdExternalCheckpoint);
	} finally {
		cluster.after();
	}
}

Java Code Examples for org.apache.flink.client.program.ClusterClient#setDetached()