Java Code Examples for org.apache.flink.client.program.ClusterClient#setDetached()
The following examples show how to use
org.apache.flink.client.program.ClusterClient#setDetached().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: AbstractOperatorRestoreTestBase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
@Test public void testMigrationAndRestore() throws Throwable { ClassLoader classLoader = this.getClass().getClassLoader(); ClusterClient<?> clusterClient = MINI_CLUSTER_RESOURCE.getClusterClient(); clusterClient.setDetached(true); final Deadline deadline = Deadline.now().plus(TEST_TIMEOUT); // submit job with old version savepoint and create a migrated savepoint in the new version String savepointPath = migrateJob(classLoader, clusterClient, deadline); // restore from migrated new version savepoint restoreJob(classLoader, clusterClient, deadline, savepointPath); }
Example 2
Source File: AbstractOperatorRestoreTestBase.java From flink with Apache License 2.0 | 5 votes |
@Test public void testMigrationAndRestore() throws Throwable { ClassLoader classLoader = this.getClass().getClassLoader(); ClusterClient<?> clusterClient = cluster.getClusterClient(); clusterClient.setDetached(true); final Deadline deadline = Deadline.now().plus(TEST_TIMEOUT); // submit job with old version savepoint and create a migrated savepoint in the new version String savepointPath = migrateJob(classLoader, clusterClient, deadline); // restore from migrated new version savepoint restoreJob(classLoader, clusterClient, deadline, savepointPath); }
Example 3
Source File: CancelingTestBase.java From flink with Apache License 2.0 | 5 votes |
protected void runAndCancelJob(Plan plan, final int msecsTillCanceling, int maxTimeTillCanceled) throws Exception { // submit job final JobGraph jobGraph = getJobGraph(plan); ClusterClient<?> client = CLUSTER.getClusterClient(); client.setDetached(true); JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, CancelingTestBase.class.getClassLoader()); Deadline submissionDeadLine = new FiniteDuration(2, TimeUnit.MINUTES).fromNow(); JobStatus jobStatus = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS); while (jobStatus != JobStatus.RUNNING && submissionDeadLine.hasTimeLeft()) { Thread.sleep(50); jobStatus = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS); } if (jobStatus != JobStatus.RUNNING) { Assert.fail("Job not in state RUNNING."); } Thread.sleep(msecsTillCanceling); client.cancel(jobSubmissionResult.getJobID()); Deadline cancelDeadline = new FiniteDuration(maxTimeTillCanceled, TimeUnit.MILLISECONDS).fromNow(); JobStatus jobStatusAfterCancel = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS); while (jobStatusAfterCancel != JobStatus.CANCELED && cancelDeadline.hasTimeLeft()) { Thread.sleep(50); jobStatusAfterCancel = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS); } if (jobStatusAfterCancel != JobStatus.CANCELED) { Assert.fail("Failed to cancel job with ID " + jobSubmissionResult.getJobID() + '.'); } }
Example 4
Source File: SavepointReaderKeyedStateITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Submits {@code jobGraph} through a detached client, waits until {@code SavepointSource}
 * reports that it has finished, and then triggers a savepoint into the directory obtained
 * from {@code getTempDirPath}.
 *
 * <p>The job is cancelled in the {@code finally} block, whether or not the savepoint succeeded.
 *
 * @param jobGraph the job to run and snapshot
 * @return the savepoint path reported by the cluster client
 * @throws Exception if submission or the savepoint fails, the wait is interrupted, or the
 *     savepoint does not complete within the remaining deadline
 */
private String takeSavepoint(JobGraph jobGraph) throws Exception {
    SavepointSource.initializeForTest();

    ClusterClient<?> client = miniClusterResource.getClusterClient();
    client.setDetached(true);

    JobID jobId = jobGraph.getJobID();

    Deadline deadline = Deadline.fromNow(Duration.ofMinutes(5));

    String dirPath = getTempDirPath(new AbstractID().toHexString());

    try {
        JobSubmissionResult result = client.submitJob(jobGraph, getClass().getClassLoader());

        boolean finished = false;
        while (deadline.hasTimeLeft()) {
            if (SavepointSource.isFinished()) {
                finished = true;
                break;
            }

            // Back off briefly between checks instead of spinning on a core for up to
            // five minutes while the source initializes.
            Thread.sleep(2L);
        }

        if (!finished) {
            Assert.fail("Failed to initialize state within deadline");
        }

        CompletableFuture<String> path = client.triggerSavepoint(result.getJobID(), dirPath);
        return path.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
    } finally {
        client.cancel(jobId);
    }
}
Example 5
Source File: SavepointITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
@Test public void testTriggerSavepointWithCheckpointingDisabled() throws Exception { // Config final int numTaskManagers = 1; final int numSlotsPerTaskManager = 1; final Configuration config = new Configuration(); final MiniClusterWithClientResource cluster = new MiniClusterWithClientResource( new MiniClusterResourceConfiguration.Builder() .setConfiguration(config) .setNumberTaskManagers(numTaskManagers) .setNumberSlotsPerTaskManager(numSlotsPerTaskManager) .build()); cluster.before(); final ClusterClient<?> client = cluster.getClusterClient(); final JobVertex vertex = new JobVertex("Blocking vertex"); vertex.setInvokableClass(BlockingNoOpInvokable.class); vertex.setParallelism(1); final JobGraph graph = new JobGraph(vertex); try { client.setDetached(true); client.submitJob(graph, SavepointITCase.class.getClassLoader()); client.triggerSavepoint(graph.getJobID(), null).get(); fail(); } catch (ExecutionException e) { assertTrue(ExceptionUtils.findThrowable(e, IllegalStateException.class).isPresent()); assertTrue(ExceptionUtils.findThrowableWithMessage(e, graph.getJobID().toString()).isPresent()); assertTrue(ExceptionUtils.findThrowableWithMessage(e, "is not a streaming job").isPresent()); } finally { cluster.after(); } }
Example 6
Source File: SavepointITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception { final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000); jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath)); final JobID jobId = jobGraph.getJobID(); StatefulCounter.resetForTest(parallelism); MiniClusterWithClientResource cluster = clusterFactory.get(); cluster.before(); ClusterClient<?> client = cluster.getClusterClient(); try { client.setDetached(true); client.submitJob(jobGraph, SavepointITCase.class.getClassLoader()); // Await state is restored StatefulCounter.getRestoreLatch().await(); // Await some progress after restore StatefulCounter.getProgressLatch().await(); client.cancel(jobId); FutureUtils.retrySuccessfulWithDelay( () -> client.getJobStatus(jobId), Time.milliseconds(50), Deadline.now().plus(Duration.ofSeconds(30)), status -> status == JobStatus.CANCELED, TestingUtils.defaultScheduledExecutor() ); client.disposeSavepoint(savepointPath) .get(); assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists()); } finally { cluster.after(); StatefulCounter.resetForTest(parallelism); } }
Example 7
Source File: SavepointITCase.java From flink with Apache License 2.0 | 5 votes |
@Test public void testTriggerSavepointWithCheckpointingDisabled() throws Exception { // Config final int numTaskManagers = 1; final int numSlotsPerTaskManager = 1; final Configuration config = new Configuration(); final MiniClusterWithClientResource cluster = new MiniClusterWithClientResource( new MiniClusterResourceConfiguration.Builder() .setConfiguration(config) .setNumberTaskManagers(numTaskManagers) .setNumberSlotsPerTaskManager(numSlotsPerTaskManager) .build()); cluster.before(); final ClusterClient<?> client = cluster.getClusterClient(); final JobVertex vertex = new JobVertex("Blocking vertex"); vertex.setInvokableClass(BlockingNoOpInvokable.class); vertex.setParallelism(1); final JobGraph graph = new JobGraph(vertex); try { client.setDetached(true); client.submitJob(graph, SavepointITCase.class.getClassLoader()); client.triggerSavepoint(graph.getJobID(), null).get(); fail(); } catch (ExecutionException e) { assertTrue(ExceptionUtils.findThrowable(e, IllegalStateException.class).isPresent()); assertTrue(ExceptionUtils.findThrowableWithMessage(e, graph.getJobID().toString()).isPresent()); assertTrue(ExceptionUtils.findThrowableWithMessage(e, "is not a streaming job").isPresent()); } finally { cluster.after(); } }
Example 8
Source File: CancelingTestBase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
protected void runAndCancelJob(Plan plan, final int msecsTillCanceling, int maxTimeTillCanceled) throws Exception { // submit job final JobGraph jobGraph = getJobGraph(plan); ClusterClient<?> client = CLUSTER.getClusterClient(); client.setDetached(true); JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, CancelingTestBase.class.getClassLoader()); Deadline submissionDeadLine = new FiniteDuration(2, TimeUnit.MINUTES).fromNow(); JobStatus jobStatus = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS); while (jobStatus != JobStatus.RUNNING && submissionDeadLine.hasTimeLeft()) { Thread.sleep(50); jobStatus = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS); } if (jobStatus != JobStatus.RUNNING) { Assert.fail("Job not in state RUNNING."); } Thread.sleep(msecsTillCanceling); client.cancel(jobSubmissionResult.getJobID()); Deadline cancelDeadline = new FiniteDuration(maxTimeTillCanceled, TimeUnit.MILLISECONDS).fromNow(); JobStatus jobStatusAfterCancel = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS); while (jobStatusAfterCancel != JobStatus.CANCELED && cancelDeadline.hasTimeLeft()) { Thread.sleep(50); jobStatusAfterCancel = client.getJobStatus(jobSubmissionResult.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS); } if (jobStatusAfterCancel != JobStatus.CANCELED) { Assert.fail("Failed to cancel job with ID " + jobSubmissionResult.getJobID() + '.'); } }
Example 9
Source File: WebFrontendITCase.java From flink with Apache License 2.0 | 4 votes |
@Test public void testCancelYarn() throws Exception { // this only works if there is no active job at this point assertTrue(getRunningJobs(CLUSTER.getClusterClient()).isEmpty()); // Create a task final JobVertex sender = new JobVertex("Sender"); sender.setParallelism(2); sender.setInvokableClass(BlockingInvokable.class); final JobGraph jobGraph = new JobGraph("Stoppable streaming test job", sender); final JobID jid = jobGraph.getJobID(); ClusterClient<?> clusterClient = CLUSTER.getClusterClient(); clusterClient.setDetached(true); clusterClient.submitJob(jobGraph, WebFrontendITCase.class.getClassLoader()); // wait for job to show up while (getRunningJobs(CLUSTER.getClusterClient()).isEmpty()) { Thread.sleep(10); } // wait for tasks to be properly running BlockingInvokable.latch.await(); final FiniteDuration testTimeout = new FiniteDuration(2, TimeUnit.MINUTES); final Deadline deadline = testTimeout.fromNow(); try (HttpTestClient client = new HttpTestClient("localhost", getRestPort())) { // Request the file from the web server client.sendGetRequest("/jobs/" + jid + "/yarn-cancel", deadline.timeLeft()); HttpTestClient.SimpleHttpResponse response = client .getNextResponse(deadline.timeLeft()); assertEquals(HttpResponseStatus.ACCEPTED, response.getStatus()); assertEquals("application/json; charset=UTF-8", response.getType()); assertEquals("{}", response.getContent()); } // wait for cancellation to finish while (!getRunningJobs(CLUSTER.getClusterClient()).isEmpty()) { Thread.sleep(20); } BlockingInvokable.reset(); }
Example 10
Source File: SavepointMigrationTestBase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
@SafeVarargs protected final void restoreAndExecute( StreamExecutionEnvironment env, String savepointPath, Tuple2<String, Integer>... expectedAccumulators) throws Exception { ClusterClient<?> client = miniClusterResource.getClusterClient(); client.setDetached(true); // Submit the job JobGraph jobGraph = env.getStreamGraph().getJobGraph(); jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath)); JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, SavepointMigrationTestBase.class.getClassLoader()); boolean done = false; while (DEADLINE.hasTimeLeft()) { // try and get a job result, this will fail if the job already failed. Use this // to get out of this loop JobID jobId = jobSubmissionResult.getJobID(); try { CompletableFuture<JobStatus> jobStatusFuture = client.getJobStatus(jobSubmissionResult.getJobID()); JobStatus jobStatus = jobStatusFuture.get(5, TimeUnit.SECONDS); assertNotEquals(JobStatus.FAILED, jobStatus); } catch (Exception e) { fail("Could not connect to job: " + e); } Thread.sleep(100); Map<String, OptionalFailure<Object>> accumulators = client.getAccumulators(jobId); boolean allDone = true; for (Tuple2<String, Integer> acc : expectedAccumulators) { OptionalFailure<Object> numFinished = accumulators.get(acc.f0); if (numFinished == null) { allDone = false; break; } if (!numFinished.get().equals(acc.f1)) { allDone = false; break; } } if (allDone) { done = true; break; } } if (!done) { fail("Did not see the expected accumulator results within time limit."); } }
Example 11
Source File: SavepointITCase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
@Test public void testSubmitWithUnknownSavepointPath() throws Exception { // Config int numTaskManagers = 1; int numSlotsPerTaskManager = 1; int parallelism = numTaskManagers * numSlotsPerTaskManager; final Configuration config = new Configuration(); config.setString(CheckpointingOptions.SAVEPOINT_DIRECTORY, savepointDir.toURI().toString()); MiniClusterWithClientResource cluster = new MiniClusterWithClientResource( new MiniClusterResourceConfiguration.Builder() .setConfiguration(config) .setNumberTaskManagers(numTaskManagers) .setNumberSlotsPerTaskManager(numSlotsPerTaskManager) .build()); cluster.before(); ClusterClient<?> client = cluster.getClusterClient(); try { // High value to ensure timeouts if restarted. int numberOfRetries = 1000; // Submit the job // Long delay to ensure that the test times out if the job // manager tries to restart the job. final JobGraph jobGraph = createJobGraph(parallelism, numberOfRetries, 3600000); // Set non-existing savepoint path jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath("unknown path")); assertEquals("unknown path", jobGraph.getSavepointRestoreSettings().getRestorePath()); LOG.info("Submitting job " + jobGraph.getJobID() + " in detached mode."); try { client.setDetached(false); client.submitJob(jobGraph, SavepointITCase.class.getClassLoader()); } catch (Exception e) { Optional<JobExecutionException> expectedJobExecutionException = ExceptionUtils.findThrowable(e, JobExecutionException.class); Optional<FileNotFoundException> expectedFileNotFoundException = ExceptionUtils.findThrowable(e, FileNotFoundException.class); if (!(expectedJobExecutionException.isPresent() && expectedFileNotFoundException.isPresent())) { throw e; } } } finally { cluster.after(); } }
Example 12
Source File: ResumeCheckpointManuallyITCase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
private void testExternalizedCheckpoints( File checkpointDir, String zooKeeperQuorum, StateBackend backend, boolean localRecovery) throws Exception { final Configuration config = new Configuration(); final File savepointDir = temporaryFolder.newFolder(); config.setString(CheckpointingOptions.CHECKPOINTS_DIRECTORY, checkpointDir.toURI().toString()); config.setString(CheckpointingOptions.SAVEPOINT_DIRECTORY, savepointDir.toURI().toString()); config.setBoolean(CheckpointingOptions.LOCAL_RECOVERY, localRecovery); // ZooKeeper recovery mode? if (zooKeeperQuorum != null) { final File haDir = temporaryFolder.newFolder(); config.setString(HighAvailabilityOptions.HA_MODE, "ZOOKEEPER"); config.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, zooKeeperQuorum); config.setString(HighAvailabilityOptions.HA_STORAGE_PATH, haDir.toURI().toString()); } MiniClusterWithClientResource cluster = new MiniClusterWithClientResource( new MiniClusterResourceConfiguration.Builder() .setConfiguration(config) .setNumberTaskManagers(NUM_TASK_MANAGERS) .setNumberSlotsPerTaskManager(SLOTS_PER_TASK_MANAGER) .build()); cluster.before(); ClusterClient<?> client = cluster.getClusterClient(); client.setDetached(true); try { // main test sequence: start job -> eCP -> restore job -> eCP -> restore job String firstExternalCheckpoint = runJobAndGetExternalizedCheckpoint(backend, checkpointDir, null, client); assertNotNull(firstExternalCheckpoint); String secondExternalCheckpoint = runJobAndGetExternalizedCheckpoint(backend, checkpointDir, firstExternalCheckpoint, client); assertNotNull(secondExternalCheckpoint); String thirdExternalCheckpoint = runJobAndGetExternalizedCheckpoint(backend, checkpointDir, secondExternalCheckpoint, client); assertNotNull(thirdExternalCheckpoint); } finally { cluster.after(); } }
Example 13
Source File: SavepointMigrationTestBase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
@SafeVarargs protected final void executeAndSavepoint( StreamExecutionEnvironment env, String savepointPath, Tuple2<String, Integer>... expectedAccumulators) throws Exception { ClusterClient<?> client = miniClusterResource.getClusterClient(); client.setDetached(true); // Submit the job JobGraph jobGraph = env.getStreamGraph().getJobGraph(); JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, SavepointMigrationTestBase.class.getClassLoader()); LOG.info("Submitted job {} and waiting...", jobSubmissionResult.getJobID()); boolean done = false; while (DEADLINE.hasTimeLeft()) { Thread.sleep(100); Map<String, OptionalFailure<Object>> accumulators = client.getAccumulators(jobSubmissionResult.getJobID()); boolean allDone = true; for (Tuple2<String, Integer> acc : expectedAccumulators) { OptionalFailure<Object> accumOpt = accumulators.get(acc.f0); if (accumOpt == null) { allDone = false; break; } Integer numFinished = (Integer) accumOpt.get(); if (numFinished == null) { allDone = false; break; } if (!numFinished.equals(acc.f1)) { allDone = false; break; } } if (allDone) { done = true; break; } } if (!done) { fail("Did not see the expected accumulator results within time limit."); } LOG.info("Triggering savepoint."); CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobSubmissionResult.getJobID(), null); String jobmanagerSavepointPath = savepointPathFuture.get(DEADLINE.timeLeft().toMillis(), TimeUnit.MILLISECONDS); File jobManagerSavepoint = new File(new URI(jobmanagerSavepointPath).getPath()); // savepoints were changed to be directories in Flink 1.3 if (jobManagerSavepoint.isDirectory()) { FileUtils.moveDirectory(jobManagerSavepoint, new File(savepointPath)); } else { FileUtils.moveFile(jobManagerSavepoint, new File(savepointPath)); } }
Example 14
Source File: JMXJobManagerMetricTest.java From flink with Apache License 2.0 | 4 votes |
/**
 * Tests that metrics registered on the JobManager are actually accessible via JMX.
 *
 * <p>A blocking job named "TestingJob" is submitted; once it is RUNNING, its
 * {@code lastCheckpointSize} metric is looked up on the platform MBean server and expected
 * to be {@code -1}.
 */
@Test
public void testJobManagerJMXMetricAccess() throws Exception {
    final Deadline deadline = Deadline.now().plus(Duration.ofMinutes(2));

    try {
        final JobVertex sourceVertex = new JobVertex("Source");
        sourceVertex.setInvokableClass(BlockingInvokable.class);

        final JobGraph jobGraph = new JobGraph("TestingJob", sourceVertex);

        final CheckpointCoordinatorConfiguration coordinatorConfiguration = new CheckpointCoordinatorConfiguration(
            500,
            500,
            50,
            5,
            CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
            true,
            false,
            0);
        jobGraph.setSnapshotSettings(new JobCheckpointingSettings(
            Collections.<JobVertexID>emptyList(),
            Collections.<JobVertexID>emptyList(),
            Collections.<JobVertexID>emptyList(),
            coordinatorConfiguration,
            null));

        final ClusterClient<?> client = MINI_CLUSTER_RESOURCE.getClusterClient();
        client.setDetached(true);
        client.submitJob(jobGraph, JMXJobManagerMetricTest.class.getClassLoader());

        // Retry the status lookup every 10 ms until the job is RUNNING or the deadline expires.
        FutureUtils.retrySuccessfulWithDelay(
            () -> client.getJobStatus(jobGraph.getJobID()),
            Time.milliseconds(10),
            deadline,
            status -> status == JobStatus.RUNNING,
            TestingUtils.defaultScheduledExecutor()
        ).get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

        final MBeanServer mBeanServer = ManagementFactory.getPlatformMBeanServer();
        final ObjectName metricPattern = new ObjectName("org.apache.flink.jobmanager.job.lastCheckpointSize:job_name=TestingJob,*");
        final Set<ObjectName> matchingNames = mBeanServer.queryNames(metricPattern, null);

        Assert.assertEquals(1, matchingNames.size());
        assertEquals(-1L, mBeanServer.getAttribute(matchingNames.iterator().next(), "Value"));

        BlockingInvokable.unblock();
    } finally {
        // Runs on the failure path as well as after the unblock above.
        BlockingInvokable.unblock();
    }
}
Example 15
Source File: JMXJobManagerMetricTest.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
/**
 * Tests that metrics registered on the JobManager are actually accessible via JMX.
 *
 * <p>A blocking job named "TestingJob" is submitted; once it is RUNNING, its
 * {@code lastCheckpointSize} metric is looked up on the platform MBean server and expected
 * to be {@code -1}.
 */
@Test
public void testJobManagerJMXMetricAccess() throws Exception {
    final Deadline deadline = Deadline.now().plus(Duration.ofMinutes(2));

    try {
        final JobVertex sourceVertex = new JobVertex("Source");
        sourceVertex.setInvokableClass(BlockingInvokable.class);

        final JobGraph jobGraph = new JobGraph("TestingJob", sourceVertex);

        final CheckpointCoordinatorConfiguration coordinatorConfiguration = new CheckpointCoordinatorConfiguration(
            500,
            500,
            50,
            5,
            CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
            true);
        jobGraph.setSnapshotSettings(new JobCheckpointingSettings(
            Collections.<JobVertexID>emptyList(),
            Collections.<JobVertexID>emptyList(),
            Collections.<JobVertexID>emptyList(),
            coordinatorConfiguration,
            null));

        final ClusterClient<?> client = MINI_CLUSTER_RESOURCE.getClusterClient();
        client.setDetached(true);
        client.submitJob(jobGraph, JMXJobManagerMetricTest.class.getClassLoader());

        // Retry the status lookup every 10 ms until the job is RUNNING or the deadline expires.
        FutureUtils.retrySuccessfulWithDelay(
            () -> client.getJobStatus(jobGraph.getJobID()),
            Time.milliseconds(10),
            deadline,
            status -> status == JobStatus.RUNNING,
            TestingUtils.defaultScheduledExecutor()
        ).get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

        final MBeanServer mBeanServer = ManagementFactory.getPlatformMBeanServer();
        final ObjectName metricPattern = new ObjectName("org.apache.flink.jobmanager.job.lastCheckpointSize:job_name=TestingJob,*");
        final Set<ObjectName> matchingNames = mBeanServer.queryNames(metricPattern, null);

        Assert.assertEquals(1, matchingNames.size());
        assertEquals(-1L, mBeanServer.getAttribute(matchingNames.iterator().next(), "Value"));

        BlockingInvokable.unblock();
    } finally {
        // Runs on the failure path as well as after the unblock above.
        BlockingInvokable.unblock();
    }
}
Example 16
Source File: RescalingITCase.java From flink with Apache License 2.0 | 4 votes |
/** * Tests that a job cannot be restarted from a savepoint with a different parallelism if the * rescaled operator has non-partitioned state. * * @throws Exception */ @Test public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception { final int parallelism = numSlots / 2; final int parallelism2 = numSlots; final int maxParallelism = 13; Duration timeout = Duration.ofMinutes(3); Deadline deadline = Deadline.now().plus(timeout); ClusterClient<?> client = cluster.getClusterClient(); try { JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED); final JobID jobID = jobGraph.getJobID(); client.setDetached(true); client.submitJob(jobGraph, RescalingITCase.class.getClassLoader()); // wait until the operator is started StateSourceBase.workStartedLatch.await(); CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null); final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS); client.cancel(jobID); while (!getRunningJobs(client).isEmpty()) { Thread.sleep(50); } // job successfully removed JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED); scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath)); client.setDetached(false); client.submitJob(scaledJobGraph, RescalingITCase.class.getClassLoader()); } catch (JobExecutionException exception) { if (exception.getCause() instanceof IllegalStateException) { // we expect a IllegalStateException wrapped // in a JobExecutionException, because the job containing non-partitioned state // is being rescaled } else { throw exception; } } }
Example 17
Source File: SavepointMigrationTestBase.java From flink with Apache License 2.0 | 4 votes |
@SafeVarargs protected final void executeAndSavepoint( StreamExecutionEnvironment env, String savepointPath, Tuple2<String, Integer>... expectedAccumulators) throws Exception { ClusterClient<?> client = miniClusterResource.getClusterClient(); client.setDetached(true); // Submit the job JobGraph jobGraph = env.getStreamGraph().getJobGraph(); JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, SavepointMigrationTestBase.class.getClassLoader()); LOG.info("Submitted job {} and waiting...", jobSubmissionResult.getJobID()); boolean done = false; while (DEADLINE.hasTimeLeft()) { Thread.sleep(100); Map<String, OptionalFailure<Object>> accumulators = client.getAccumulators(jobSubmissionResult.getJobID()); boolean allDone = true; for (Tuple2<String, Integer> acc : expectedAccumulators) { OptionalFailure<Object> accumOpt = accumulators.get(acc.f0); if (accumOpt == null) { allDone = false; break; } Integer numFinished = (Integer) accumOpt.get(); if (numFinished == null) { allDone = false; break; } if (!numFinished.equals(acc.f1)) { allDone = false; break; } } if (allDone) { done = true; break; } } if (!done) { fail("Did not see the expected accumulator results within time limit."); } LOG.info("Triggering savepoint."); CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobSubmissionResult.getJobID(), null); String jobmanagerSavepointPath = savepointPathFuture.get(DEADLINE.timeLeft().toMillis(), TimeUnit.MILLISECONDS); File jobManagerSavepoint = new File(new URI(jobmanagerSavepointPath).getPath()); // savepoints were changed to be directories in Flink 1.3 if (jobManagerSavepoint.isDirectory()) { FileUtils.moveDirectory(jobManagerSavepoint, new File(savepointPath)); } else { FileUtils.moveFile(jobManagerSavepoint, new File(savepointPath)); } }
Example 18
Source File: SavepointMigrationTestBase.java From flink with Apache License 2.0 | 4 votes |
@SafeVarargs protected final void restoreAndExecute( StreamExecutionEnvironment env, String savepointPath, Tuple2<String, Integer>... expectedAccumulators) throws Exception { ClusterClient<?> client = miniClusterResource.getClusterClient(); client.setDetached(true); // Submit the job JobGraph jobGraph = env.getStreamGraph().getJobGraph(); jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath)); JobSubmissionResult jobSubmissionResult = client.submitJob(jobGraph, SavepointMigrationTestBase.class.getClassLoader()); boolean done = false; while (DEADLINE.hasTimeLeft()) { // try and get a job result, this will fail if the job already failed. Use this // to get out of this loop JobID jobId = jobSubmissionResult.getJobID(); try { CompletableFuture<JobStatus> jobStatusFuture = client.getJobStatus(jobSubmissionResult.getJobID()); JobStatus jobStatus = jobStatusFuture.get(5, TimeUnit.SECONDS); assertNotEquals(JobStatus.FAILED, jobStatus); } catch (Exception e) { fail("Could not connect to job: " + e); } Thread.sleep(100); Map<String, OptionalFailure<Object>> accumulators = client.getAccumulators(jobId); boolean allDone = true; for (Tuple2<String, Integer> acc : expectedAccumulators) { OptionalFailure<Object> numFinished = accumulators.get(acc.f0); if (numFinished == null) { allDone = false; break; } if (!numFinished.get().equals(acc.f1)) { allDone = false; break; } } if (allDone) { done = true; break; } } if (!done) { fail("Did not see the expected accumulator results within time limit."); } }
Example 19
Source File: SavepointReaderITTestBase.java From flink with Apache License 2.0 | 4 votes |
/**
 * Submits {@code jobGraph} through a detached client, waits until {@code SavepointSource}
 * reports that it has finished, and then triggers a savepoint into the directory obtained
 * from {@code getTempDirPath}.
 *
 * <p>The job is cancelled in the {@code finally} block, whether or not the savepoint succeeded.
 *
 * @param jobGraph the job to run and snapshot
 * @return the savepoint path reported by the cluster client
 * @throws Exception if submission or the savepoint fails, or the savepoint does not complete
 *     within the remaining deadline
 */
private String takeSavepoint(JobGraph jobGraph) throws Exception {
    SavepointSource.initializeForTest();

    final ClusterClient<?> client = miniClusterResource.getClusterClient();
    client.setDetached(true);

    final JobID jobId = jobGraph.getJobID();
    final Deadline deadline = Deadline.fromNow(Duration.ofMinutes(5));
    final String savepointDirectory = getTempDirPath(new AbstractID().toHexString());

    try {
        // NOTE(review): the client was already set to detached above; the repeated call is
        // kept as in the original.
        client.setDetached(true);

        final JobSubmissionResult submission = client.submitJob(jobGraph, SavepointReaderITCase.class.getClassLoader());

        // Check every 2 ms whether the source has finished, until the deadline expires.
        boolean sourceFinished = false;
        while (!sourceFinished && deadline.hasTimeLeft()) {
            sourceFinished = SavepointSource.isFinished();
            if (!sourceFinished) {
                try {
                    Thread.sleep(2L);
                } catch (InterruptedException ignored) {
                    // Restore the interrupt flag; the loop keeps running until the deadline.
                    Thread.currentThread().interrupt();
                }
            }
        }

        if (!sourceFinished) {
            Assert.fail("Failed to initialize state within deadline");
        }

        final CompletableFuture<String> savepointFuture = client.triggerSavepoint(submission.getJobID(), savepointDirectory);
        return savepointFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
    } finally {
        client.cancel(jobId);
    }
}
Example 20
Source File: ResumeCheckpointManuallyITCase.java From flink with Apache License 2.0 | 4 votes |
private void testExternalizedCheckpoints( File checkpointDir, String zooKeeperQuorum, StateBackend backend, boolean localRecovery) throws Exception { final Configuration config = new Configuration(); final File savepointDir = temporaryFolder.newFolder(); config.setString(CheckpointingOptions.CHECKPOINTS_DIRECTORY, checkpointDir.toURI().toString()); config.setString(CheckpointingOptions.SAVEPOINT_DIRECTORY, savepointDir.toURI().toString()); config.setBoolean(CheckpointingOptions.LOCAL_RECOVERY, localRecovery); // ZooKeeper recovery mode? if (zooKeeperQuorum != null) { final File haDir = temporaryFolder.newFolder(); config.setString(HighAvailabilityOptions.HA_MODE, "ZOOKEEPER"); config.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, zooKeeperQuorum); config.setString(HighAvailabilityOptions.HA_STORAGE_PATH, haDir.toURI().toString()); } MiniClusterWithClientResource cluster = new MiniClusterWithClientResource( new MiniClusterResourceConfiguration.Builder() .setConfiguration(config) .setNumberTaskManagers(NUM_TASK_MANAGERS) .setNumberSlotsPerTaskManager(SLOTS_PER_TASK_MANAGER) .build()); cluster.before(); ClusterClient<?> client = cluster.getClusterClient(); client.setDetached(true); try { // main test sequence: start job -> eCP -> restore job -> eCP -> restore job String firstExternalCheckpoint = runJobAndGetExternalizedCheckpoint(backend, checkpointDir, null, client); assertNotNull(firstExternalCheckpoint); String secondExternalCheckpoint = runJobAndGetExternalizedCheckpoint(backend, checkpointDir, firstExternalCheckpoint, client); assertNotNull(secondExternalCheckpoint); String thirdExternalCheckpoint = runJobAndGetExternalizedCheckpoint(backend, checkpointDir, secondExternalCheckpoint, client); assertNotNull(thirdExternalCheckpoint); } finally { cluster.after(); } }