Java Code Examples for org.apache.flink.runtime.jobgraph.JobGraph#setSavepointRestoreSettings()
The following examples show how to use
org.apache.flink.runtime.jobgraph.JobGraph#setSavepointRestoreSettings().
Each example is drawn from an open-source project; you can go to the original project or source file by following the link above each example.
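All of the examples share one core pattern: build or obtain a JobGraph, then attach SavepointRestoreSettings to it before submission. Here is a minimal sketch of that pattern (the job name, vertex, and savepoint path are illustrative placeholders, not taken from any of the projects below):

import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.JobVertex;
import org.apache.flink.runtime.jobgraph.SavepointRestoreSettings;

// Restore a job from a savepoint path; the boolean allows "non-restored"
// state, i.e. savepoint state that no longer maps to any operator.
JobGraph jobGraph = new JobGraph("my-restored-job", new JobVertex("no-op vertex"));
jobGraph.setSavepointRestoreSettings(
    SavepointRestoreSettings.forPath("file:///tmp/savepoints/savepoint-abc123", true));

Submitting the JobGraph through a ClusterClient, as most of the examples below do, then triggers the restore on the JobMaster.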
Example 1
Source File: ClusterClient.java From flink with Apache License 2.0 | 6 votes |
public static JobGraph getJobGraph(Configuration flinkConfig, FlinkPlan optPlan, List<URL> jarFiles, List<URL> classpaths, SavepointRestoreSettings savepointSettings) {
    JobGraph job;
    if (optPlan instanceof StreamingPlan) {
        job = ((StreamingPlan) optPlan).getJobGraph();
        job.setSavepointRestoreSettings(savepointSettings);
    } else {
        JobGraphGenerator gen = new JobGraphGenerator(flinkConfig);
        job = gen.compileJobGraph((OptimizedPlan) optPlan);
    }

    for (URL jar : jarFiles) {
        try {
            job.addJar(new Path(jar.toURI()));
        } catch (URISyntaxException e) {
            throw new RuntimeException("URL is invalid. This should not happen.", e);
        }
    }

    job.setClasspaths(classpaths);

    return job;
}
Example 2
Source File: ClassPathJobGraphRetriever.java From flink with Apache License 2.0 | 6 votes |
@Override
public JobGraph retrieveJobGraph(Configuration configuration) throws FlinkException {
    final PackagedProgram packagedProgram = createPackagedProgram();
    final int defaultParallelism = configuration.getInteger(CoreOptions.DEFAULT_PARALLELISM);
    try {
        final JobGraph jobGraph = PackagedProgramUtils.createJobGraph(
            packagedProgram,
            configuration,
            defaultParallelism,
            jobId);
        jobGraph.setAllowQueuedScheduling(true);
        jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

        return jobGraph;
    } catch (Exception e) {
        throw new FlinkException("Could not create the JobGraph from the provided user code jar.", e);
    }
}
Example 3
Source File: JobMasterTest.java From flink with Apache License 2.0 | 6 votes |
@Nonnull
private JobGraph createJobGraphFromJobVerticesWithCheckpointing(SavepointRestoreSettings savepointRestoreSettings, JobVertex... jobVertices) {
    final JobGraph jobGraph = new JobGraph(jobVertices);

    // enable checkpointing which is required to resume from a savepoint
    final CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration = new CheckpointCoordinatorConfiguration(
        1000L,
        1000L,
        1000L,
        1,
        CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
        true,
        false,
        0);
    final JobCheckpointingSettings checkpointingSettings = new JobCheckpointingSettings(
        Collections.emptyList(),
        Collections.emptyList(),
        Collections.emptyList(),
        checkpointCoordinatorConfiguration,
        null);
    jobGraph.setSnapshotSettings(checkpointingSettings);
    jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

    return jobGraph;
}
Example 4
Source File: DFCusterClient.java From df_data_service with Apache License 2.0 | 6 votes |
private JobGraph getJobGraph(FlinkPlan optPlan, List<URL> jarFiles, List<URL> classpaths, SavepointRestoreSettings savepointSettings) {
    JobGraph job;
    if (optPlan instanceof StreamingPlan) {
        job = ((StreamingPlan) optPlan).getJobGraph();
        job.setSavepointRestoreSettings(savepointSettings);
    } else {
        JobGraphGenerator gen = new JobGraphGenerator(this.flinkConfig);
        job = gen.compileJobGraph((OptimizedPlan) optPlan);
    }

    for (URL jar : jarFiles) {
        try {
            job.addJar(new Path(jar.toURI()));
        } catch (URISyntaxException e) {
            throw new RuntimeException("URL is invalid. This should not happen.", e);
        }
    }

    job.setClasspaths(classpaths);

    return job;
}
Example 5
Source File: AbstractOperatorRestoreTestBase.java From flink with Apache License 2.0 | 6 votes |
private void restoreJob(ClusterClient<?> clusterClient, Deadline deadline, String savepointPath) throws Exception {
    JobGraph jobToRestore = createJobGraph(ExecutionMode.RESTORE);
    jobToRestore.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, allowNonRestoredState));

    assertNotNull("Job doesn't have a JobID.", jobToRestore.getJobID());

    ClientUtils.submitJob(clusterClient, jobToRestore);

    CompletableFuture<JobStatus> jobStatusFuture = FutureUtils.retrySuccessfulWithDelay(
        () -> clusterClient.getJobStatus(jobToRestore.getJobID()),
        Time.milliseconds(50),
        deadline,
        (jobStatus) -> jobStatus == JobStatus.FINISHED,
        TestingUtils.defaultScheduledExecutor());
    assertEquals(
        JobStatus.FINISHED,
        jobStatusFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
}
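A note on the boolean flag used above: SavepointRestoreSettings.forPath(path, allowNonRestoredState) controls whether the restore may skip savepoint state that cannot be mapped to any operator in the new job graph. The single-argument forPath(path) defaults to false, in which case the restore fails if any savepoint state is left unassigned; passing true lets a modified job resume anyway, as the JobMaster tests in Examples 15 and 18 demonstrate.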
Example 6
Source File: JobMasterTest.java From Flink-CEPplus with Apache License 2.0 | 6 votes |
@Nonnull
private JobGraph createJobGraphFromJobVerticesWithCheckpointing(SavepointRestoreSettings savepointRestoreSettings, JobVertex... jobVertices) {
    final JobGraph jobGraph = new JobGraph(jobVertices);

    // enable checkpointing which is required to resume from a savepoint
    final CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration = new CheckpointCoordinatorConfiguration(
        1000L,
        1000L,
        1000L,
        1,
        CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
        true);
    final JobCheckpointingSettings checkpointingSettings = new JobCheckpointingSettings(
        Collections.emptyList(),
        Collections.emptyList(),
        Collections.emptyList(),
        checkpointCoordinatorConfiguration,
        null);
    jobGraph.setSnapshotSettings(checkpointingSettings);
    jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

    return jobGraph;
}
Example 7
Source File: ClassPathJobGraphRetriever.java From Flink-CEPplus with Apache License 2.0 | 6 votes |
@Override
public JobGraph retrieveJobGraph(Configuration configuration) throws FlinkException {
    final PackagedProgram packagedProgram = createPackagedProgram();
    final int defaultParallelism = configuration.getInteger(CoreOptions.DEFAULT_PARALLELISM);
    try {
        final JobGraph jobGraph = PackagedProgramUtils.createJobGraph(
            packagedProgram,
            configuration,
            defaultParallelism,
            jobId);
        jobGraph.setAllowQueuedScheduling(true);
        jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

        return jobGraph;
    } catch (Exception e) {
        throw new FlinkException("Could not create the JobGraph from the provided user code jar.", e);
    }
}
Example 8
Source File: PipelineExecutorUtils.java From flink with Apache License 2.0 | 6 votes |
/**
 * Creates the {@link JobGraph} corresponding to the provided {@link Pipeline}.
 *
 * @param pipeline the pipeline whose job graph we are computing
 * @param configuration the configuration with the necessary information such as jars and
 *                      classpaths to be included, the parallelism of the job and potential
 *                      savepoint settings used to bootstrap its state.
 * @return the corresponding {@link JobGraph}.
 */
public static JobGraph getJobGraph(@Nonnull final Pipeline pipeline, @Nonnull final Configuration configuration) throws MalformedURLException {
    checkNotNull(pipeline);
    checkNotNull(configuration);

    final ExecutionConfigAccessor executionConfigAccessor = ExecutionConfigAccessor.fromConfiguration(configuration);
    final JobGraph jobGraph = FlinkPipelineTranslationUtil
        .getJobGraph(pipeline, configuration, executionConfigAccessor.getParallelism());

    configuration
        .getOptional(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID)
        .ifPresent(strJobID -> jobGraph.setJobID(JobID.fromHexString(strJobID)));

    jobGraph.addJars(executionConfigAccessor.getJars());
    jobGraph.setClasspaths(executionConfigAccessor.getClasspaths());
    jobGraph.setSavepointRestoreSettings(executionConfigAccessor.getSavepointRestoreSettings());

    return jobGraph;
}
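In this example the savepoint settings travel through the Configuration rather than a method parameter: ExecutionConfigAccessor.fromConfiguration(configuration) extracts them, and getSavepointRestoreSettings() hands them to the JobGraph. A plausible way to populate such a configuration is sketched below, assuming Flink's SavepointConfigOptions constants (treat the exact option names, path, and flag values as illustrative assumptions):

import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.jobgraph.SavepointConfigOptions;

// Sketch: put savepoint restore settings into the configuration that
// PipelineExecutorUtils.getJobGraph(...) will later consume.
Configuration configuration = new Configuration();
configuration.setString(SavepointConfigOptions.SAVEPOINT_PATH, "file:///tmp/savepoints/savepoint-xyz");
configuration.setBoolean(SavepointConfigOptions.SAVEPOINT_IGNORE_UNCLAIMED_STATE, true);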
Example 9
Source File: StatefulFunctionsJobGraphRetriever.java From stateful-functions with Apache License 2.0 | 6 votes |
@Override
public JobGraph retrieveJobGraph(Configuration configuration) throws FlinkException {
    final PackagedProgram packagedProgram = createPackagedProgram();
    final int defaultParallelism = configuration.getInteger(CoreOptions.DEFAULT_PARALLELISM);
    try {
        final JobGraph jobGraph = PackagedProgramUtils.createJobGraph(
            packagedProgram,
            configuration,
            defaultParallelism,
            jobId,
            false);
        jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

        return jobGraph;
    } catch (Exception e) {
        throw new FlinkException("Could not create the JobGraph from the provided user code jar.", e);
    }
}
Example 10
Source File: AbstractOperatorRestoreTestBase.java From Flink-CEPplus with Apache License 2.0 | 6 votes |
private void restoreJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline, String savepointPath) throws Exception {
    JobGraph jobToRestore = createJobGraph(ExecutionMode.RESTORE);
    jobToRestore.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, allowNonRestoredState));

    assertNotNull("Job doesn't have a JobID.", jobToRestore.getJobID());

    clusterClient.submitJob(jobToRestore, classLoader);

    CompletableFuture<JobStatus> jobStatusFuture = FutureUtils.retrySuccessfulWithDelay(
        () -> clusterClient.getJobStatus(jobToRestore.getJobID()),
        Time.milliseconds(50),
        deadline,
        (jobStatus) -> jobStatus == JobStatus.FINISHED,
        TestingUtils.defaultScheduledExecutor());
    assertEquals(
        JobStatus.FINISHED,
        jobStatusFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
}
Example 11
Source File: SavepointITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
    final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
    jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
    final JobID jobId = jobGraph.getJobID();
    StatefulCounter.resetForTest(parallelism);

    MiniClusterWithClientResource cluster = clusterFactory.get();
    cluster.before();
    ClusterClient<?> client = cluster.getClusterClient();

    try {
        client.setDetached(true);
        client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

        // Await state is restored
        StatefulCounter.getRestoreLatch().await();

        // Await some progress after restore
        StatefulCounter.getProgressLatch().await();

        client.cancel(jobId);

        FutureUtils.retrySuccessfulWithDelay(
            () -> client.getJobStatus(jobId),
            Time.milliseconds(50),
            Deadline.now().plus(Duration.ofSeconds(30)),
            status -> status == JobStatus.CANCELED,
            TestingUtils.defaultScheduledExecutor()
        );

        client.disposeSavepoint(savepointPath).get();

        assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
    } finally {
        cluster.after();
        StatefulCounter.resetForTest(parallelism);
    }
}
Example 12
Source File: FlinkRequiresStableInputTest.java From beam with Apache License 2.0 | 5 votes |
private JobID restoreFromSavepoint(Pipeline pipeline, String savepointDir)
        throws ExecutionException, InterruptedException {
    JobGraph jobGraph = getJobGraph(pipeline);
    SavepointRestoreSettings savepointSettings = SavepointRestoreSettings.forPath(savepointDir);
    jobGraph.setSavepointRestoreSettings(savepointSettings);
    return flinkCluster.submitJob(jobGraph).get().getJobID();
}
Example 13
Source File: SavepointITCase.java From flink with Apache License 2.0 | 5 votes |
private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
    final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
    jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
    final JobID jobId = jobGraph.getJobID();
    StatefulCounter.resetForTest(parallelism);

    MiniClusterWithClientResource cluster = clusterFactory.get();
    cluster.before();
    ClusterClient<?> client = cluster.getClusterClient();

    try {
        client.setDetached(true);
        client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

        // Await state is restored
        StatefulCounter.getRestoreLatch().await();

        // Await some progress after restore
        StatefulCounter.getProgressLatch().await();

        client.cancel(jobId);

        FutureUtils.retrySuccessfulWithDelay(
            () -> client.getJobStatus(jobId),
            Time.milliseconds(50),
            Deadline.now().plus(Duration.ofSeconds(30)),
            status -> status == JobStatus.CANCELED,
            TestingUtils.defaultScheduledExecutor()
        );

        client.disposeSavepoint(savepointPath).get();

        assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
    } finally {
        cluster.after();
        StatefulCounter.resetForTest(parallelism);
    }
}
Example 14
Source File: SavepointWriterITCase.java From flink with Apache License 2.0 | 5 votes |
private void validateModification(String savepointPath) throws ProgramInvocationException {
    StreamExecutionEnvironment sEnv = StreamExecutionEnvironment.getExecutionEnvironment();
    sEnv.setStateBackend(backend);

    CollectSink.accountList.clear();

    DataStream<Account> stream = sEnv.fromCollection(accounts)
        .keyBy(acc -> acc.id)
        .flatMap(new UpdateAndGetAccount())
        .uid(ACCOUNT_UID);

    stream.addSink(new CollectSink());

    stream
        .map(acc -> acc.id)
        .map(new StatefulOperator())
        .uid(MODIFY_UID)
        .addSink(new DiscardingSink<>());

    JobGraph jobGraph = sEnv.getStreamGraph().getJobGraph();
    jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, false));

    ClusterClient<?> client = miniClusterResource.getClusterClient();
    client.submitJob(jobGraph, SavepointWriterITCase.class.getClassLoader());

    Assert.assertEquals("Unexpected output", 3, CollectSink.accountList.size());
}
Example 15
Source File: JobMasterTest.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
/**
 * Tests that a JobMaster will only restore a modified JobGraph if non
 * restored state is allowed.
 */
@Test
public void testRestoringModifiedJobFromSavepoint() throws Exception {
    // create savepoint data
    final long savepointId = 42L;
    final OperatorID operatorID = new OperatorID();
    final File savepointFile = createSavepointWithOperatorState(savepointId, operatorID);

    // set savepoint settings which don't allow non restored state
    final SavepointRestoreSettings savepointRestoreSettings = SavepointRestoreSettings.forPath(
        savepointFile.getAbsolutePath(),
        false);

    // create a new operator
    final JobVertex jobVertex = new JobVertex("New operator");
    jobVertex.setInvokableClass(NoOpInvokable.class);
    final JobGraph jobGraphWithNewOperator = createJobGraphFromJobVerticesWithCheckpointing(savepointRestoreSettings, jobVertex);

    final StandaloneCompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
    final TestingCheckpointRecoveryFactory testingCheckpointRecoveryFactory = new TestingCheckpointRecoveryFactory(completedCheckpointStore, new StandaloneCheckpointIDCounter());
    haServices.setCheckpointRecoveryFactory(testingCheckpointRecoveryFactory);

    try {
        createJobMaster(
            configuration,
            jobGraphWithNewOperator,
            haServices,
            new TestingJobManagerSharedServicesBuilder().build());
        fail("Should fail because we cannot resume the changed JobGraph from the savepoint.");
    } catch (IllegalStateException expected) {
        // that was expected :-)
    }

    // allow for non restored state
    jobGraphWithNewOperator.setSavepointRestoreSettings(
        SavepointRestoreSettings.forPath(
            savepointFile.getAbsolutePath(),
            true));

    final JobMaster jobMaster = createJobMaster(
        configuration,
        jobGraphWithNewOperator,
        haServices,
        new TestingJobManagerSharedServicesBuilder().build());

    try {
        // starting the JobMaster should have read the savepoint
        final CompletedCheckpoint savepointCheckpoint = completedCheckpointStore.getLatestCheckpoint();

        assertThat(savepointCheckpoint, Matchers.notNullValue());
        assertThat(savepointCheckpoint.getCheckpointID(), is(savepointId));
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
Example 16
Source File: AbstractOperatorRestoreTestBase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
private String migrateJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline) throws Throwable {
    URL savepointResource = AbstractOperatorRestoreTestBase.class.getClassLoader().getResource("operatorstate/" + getMigrationSavepointName());
    if (savepointResource == null) {
        throw new IllegalArgumentException("Savepoint file does not exist.");
    }
    JobGraph jobToMigrate = createJobGraph(ExecutionMode.MIGRATE);
    jobToMigrate.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointResource.getFile()));

    assertNotNull(jobToMigrate.getJobID());

    clusterClient.submitJob(jobToMigrate, classLoader);

    CompletableFuture<JobStatus> jobRunningFuture = FutureUtils.retrySuccessfulWithDelay(
        () -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
        Time.milliseconds(50),
        deadline,
        (jobStatus) -> jobStatus == JobStatus.RUNNING,
        TestingUtils.defaultScheduledExecutor());
    assertEquals(
        JobStatus.RUNNING,
        jobRunningFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

    // Trigger savepoint
    File targetDirectory = tmpFolder.newFolder();
    String savepointPath = null;

    // FLINK-6918: Retry cancel with savepoint message in case that StreamTasks were not running
    // TODO: The retry logic should be removed once the StreamTask lifecycle has been fixed (see FLINK-4714)
    while (deadline.hasTimeLeft() && savepointPath == null) {
        try {
            savepointPath = clusterClient.cancelWithSavepoint(
                jobToMigrate.getJobID(),
                targetDirectory.getAbsolutePath());
        } catch (Exception e) {
            String exceptionString = ExceptionUtils.stringifyException(e);
            if (!(exceptionString.matches("(.*\n)*.*savepoint for the job .* failed(.*\n)*") // legacy
                || exceptionString.matches("(.*\n)*.*was not running(.*\n)*")
                || exceptionString.matches("(.*\n)*.*Not all required tasks are currently running(.*\n)*") // new
                || exceptionString.matches("(.*\n)*.*Checkpoint was declined \\(tasks not ready\\)(.*\n)*"))) { // new
                throw e;
            }
        }
    }

    assertNotNull("Could not take savepoint.", savepointPath);

    CompletableFuture<JobStatus> jobCanceledFuture = FutureUtils.retrySuccessfulWithDelay(
        () -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
        Time.milliseconds(50),
        deadline,
        (jobStatus) -> jobStatus == JobStatus.CANCELED,
        TestingUtils.defaultScheduledExecutor());
    assertEquals(
        JobStatus.CANCELED,
        jobCanceledFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

    return savepointPath;
}
Example 17
Source File: AbstractOperatorRestoreTestBase.java From flink with Apache License 2.0 | 4 votes |
private String migrateJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline) throws Throwable {
    URL savepointResource = AbstractOperatorRestoreTestBase.class.getClassLoader().getResource("operatorstate/" + getMigrationSavepointName());
    if (savepointResource == null) {
        throw new IllegalArgumentException("Savepoint file does not exist.");
    }
    JobGraph jobToMigrate = createJobGraph(ExecutionMode.MIGRATE);
    jobToMigrate.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointResource.getFile()));

    assertNotNull(jobToMigrate.getJobID());

    clusterClient.submitJob(jobToMigrate, classLoader);

    CompletableFuture<JobStatus> jobRunningFuture = FutureUtils.retrySuccessfulWithDelay(
        () -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
        Time.milliseconds(50),
        deadline,
        (jobStatus) -> jobStatus == JobStatus.RUNNING,
        TestingUtils.defaultScheduledExecutor());
    assertEquals(
        JobStatus.RUNNING,
        jobRunningFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

    // Trigger savepoint
    File targetDirectory = tmpFolder.newFolder();
    String savepointPath = null;

    // FLINK-6918: Retry cancel with savepoint message in case that StreamTasks were not running
    // TODO: The retry logic should be removed once the StreamTask lifecycle has been fixed (see FLINK-4714)
    while (deadline.hasTimeLeft() && savepointPath == null) {
        try {
            savepointPath = clusterClient.cancelWithSavepoint(
                jobToMigrate.getJobID(),
                targetDirectory.getAbsolutePath());
        } catch (Exception e) {
            String exceptionString = ExceptionUtils.stringifyException(e);
            if (!PATTERN_CANCEL_WITH_SAVEPOINT_TOLERATED_EXCEPTIONS.matcher(exceptionString).find()) {
                throw e;
            }
        }
    }

    assertNotNull("Could not take savepoint.", savepointPath);

    CompletableFuture<JobStatus> jobCanceledFuture = FutureUtils.retrySuccessfulWithDelay(
        () -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
        Time.milliseconds(50),
        deadline,
        (jobStatus) -> jobStatus == JobStatus.CANCELED,
        TestingUtils.defaultScheduledExecutor());
    assertEquals(
        JobStatus.CANCELED,
        jobCanceledFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

    return savepointPath;
}
Example 18
Source File: JobMasterTest.java From flink with Apache License 2.0 | 4 votes |
/**
 * Tests that a JobMaster will only restore a modified JobGraph if non
 * restored state is allowed.
 */
@Test
public void testRestoringModifiedJobFromSavepoint() throws Exception {
    // create savepoint data
    final long savepointId = 42L;
    final OperatorID operatorID = new OperatorID();
    final File savepointFile = createSavepointWithOperatorState(savepointId, operatorID);

    // set savepoint settings which don't allow non restored state
    final SavepointRestoreSettings savepointRestoreSettings = SavepointRestoreSettings.forPath(
        savepointFile.getAbsolutePath(),
        false);

    // create a new operator
    final JobVertex jobVertex = new JobVertex("New operator");
    jobVertex.setInvokableClass(NoOpInvokable.class);
    final JobGraph jobGraphWithNewOperator = createJobGraphFromJobVerticesWithCheckpointing(savepointRestoreSettings, jobVertex);

    final StandaloneCompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
    final TestingCheckpointRecoveryFactory testingCheckpointRecoveryFactory = new TestingCheckpointRecoveryFactory(completedCheckpointStore, new StandaloneCheckpointIDCounter());
    haServices.setCheckpointRecoveryFactory(testingCheckpointRecoveryFactory);

    try {
        createJobMaster(
            configuration,
            jobGraphWithNewOperator,
            haServices,
            new TestingJobManagerSharedServicesBuilder().build());
        fail("Should fail because we cannot resume the changed JobGraph from the savepoint.");
    } catch (IllegalStateException expected) {
        // that was expected :-)
    }

    // allow for non restored state
    jobGraphWithNewOperator.setSavepointRestoreSettings(
        SavepointRestoreSettings.forPath(
            savepointFile.getAbsolutePath(),
            true));

    final JobMaster jobMaster = createJobMaster(
        configuration,
        jobGraphWithNewOperator,
        haServices,
        new TestingJobManagerSharedServicesBuilder().build());

    try {
        // starting the JobMaster should have read the savepoint
        final CompletedCheckpoint savepointCheckpoint = completedCheckpointStore.getLatestCheckpoint(false);

        assertThat(savepointCheckpoint, Matchers.notNullValue());
        assertThat(savepointCheckpoint.getCheckpointID(), is(savepointId));
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
Example 19
Source File: RescalingITCase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 *
 * @throws Exception
 */
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
    final int parallelism = numSlots / 2;
    final int parallelism2 = numSlots;
    final int maxParallelism = 13;

    Duration timeout = Duration.ofMinutes(3);
    Deadline deadline = Deadline.now().plus(timeout);

    ClusterClient<?> client = cluster.getClusterClient();

    try {
        JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

        final JobID jobID = jobGraph.getJobID();

        client.setDetached(true);
        client.submitJob(jobGraph, RescalingITCase.class.getClassLoader());

        // wait until the operator is started
        StateSourceBase.workStartedLatch.await();

        CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null);

        final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

        client.cancel(jobID);

        while (!getRunningJobs(client).isEmpty()) {
            Thread.sleep(50);
        }

        // job successfully removed
        JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

        client.setDetached(false);
        client.submitJob(scaledJobGraph, RescalingITCase.class.getClassLoader());
    } catch (JobExecutionException exception) {
        if (exception.getCause() instanceof IllegalStateException) {
            // we expect an IllegalStateException wrapped
            // in a JobExecutionException, because the job containing non-partitioned state
            // is being rescaled
        } else {
            throw exception;
        }
    }
}
Example 20
Source File: FlinkPravegaReaderSavepointITCase.java From flink-connectors with Apache License 2.0 | 4 votes |
@Test
public void testPravegaWithSavepoint() throws Exception {
    final int sourceParallelism = 4;
    final int numPravegaSegments = 4;
    final int numElements = NUM_STREAM_ELEMENTS;

    // set up the stream
    final String streamName = RandomStringUtils.randomAlphabetic(20);
    SETUP_UTILS.createTestStream(streamName, numPravegaSegments);

    // we create two independent Flink jobs (that come from the same program)
    final JobGraph program1 = getFlinkJob(sourceParallelism, streamName, numElements);

    try (
        final EventStreamWriter<Integer> eventWriter = SETUP_UTILS.getIntegerWriter(streamName);

        // create the producer that writes to the stream
        final ThrottledIntegerWriter producer = new ThrottledIntegerWriter(
            eventWriter,
            numElements,
            numElements / 2, // the latest when the thread must be un-throttled
            1,               // the initial sleep time per element
            false
        )
    ) {
        // the object on which we block while waiting for the checkpoint completion
        final OneShotLatch sync = new OneShotLatch();
        NotifyingMapper.TO_CALL_ON_COMPLETION.set(sync::trigger);

        // launch the Flink program from a separate thread
        final CheckedThread flinkRunner = new CheckedThread() {
            @Override
            public void go() throws Exception {
                MINI_CLUSTER.submitJob(program1);
            }
        };

        producer.start();
        flinkRunner.start();

        // wait until at least one checkpoint is complete before triggering the savepoint
        sync.await();

        // now that we are comfortably into the program, trigger a savepoint
        String savepointPath = null;

        // since with the short timeouts we configure in these tests, Pravega Checkpoints
        // sometimes don't complete in time, we retry a bit here
        for (int attempt = 1; savepointPath == null && attempt <= 5; attempt++) {
            savepointPath = MINI_CLUSTER.triggerSavepoint(program1.getJobID(), tmpFolder.newFolder().getAbsolutePath(), false).get();
        }
        assertNotNull("Failed to trigger a savepoint", savepointPath);

        // now cancel the job and relaunch a new one
        MINI_CLUSTER.cancelJob(program1.getJobID());

        try {
            // this throws an exception that the job was cancelled
            flinkRunner.sync();
        } catch (JobCancellationException ignored) {
        }

        producer.unthrottle();

        // now, resume with a new program
        final JobGraph program2 = getFlinkJob(sourceParallelism, streamName, numElements);
        program2.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, false));

        // if these calls complete without exception, then the test passes
        try {
            MINI_CLUSTER.executeJobBlocking(program2);
        } catch (Exception e) {
            if (!(ExceptionUtils.getRootCause(e) instanceof SuccessException)) {
                throw e;
            }
        }
    }
}