Java Code Examples for org.apache.flink.runtime.jobgraph.JobGraph#setSavepointRestoreSettings()
The following examples show how to use
org.apache.flink.runtime.jobgraph.JobGraph#setSavepointRestoreSettings().
Each example is drawn from an open-source project; you can go to the original project or source file by following the link above each example.
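All of the examples share one core pattern: build or obtain a JobGraph, then attach SavepointRestoreSettings to it before submission. Here is a minimal sketch of that pattern (the job name, vertex, and savepoint path are illustrative placeholders, not taken from any of the projects below):

import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.JobVertex;
import org.apache.flink.runtime.jobgraph.SavepointRestoreSettings;

// Restore a job from a savepoint path; the boolean allows "non-restored"
// state, i.e. savepoint state that no longer maps to any operator.
JobGraph jobGraph = new JobGraph("my-restored-job", new JobVertex("no-op vertex"));
jobGraph.setSavepointRestoreSettings(
    SavepointRestoreSettings.forPath("file:///tmp/savepoints/savepoint-abc123", true));

Submitting the JobGraph through a ClusterClient, as most of the examples below do, then triggers the restore on the JobMaster.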
Example 1
Source File: ClusterClient.java From flink with Apache License 2.0 | 6 votes |
public static JobGraph getJobGraph(Configuration flinkConfig, FlinkPlan optPlan, List<URL> jarFiles, List<URL> classpaths, SavepointRestoreSettings savepointSettings) {
    JobGraph job;
    if (optPlan instanceof StreamingPlan) {
        job = ((StreamingPlan) optPlan).getJobGraph();
        job.setSavepointRestoreSettings(savepointSettings);
    } else {
        JobGraphGenerator gen = new JobGraphGenerator(flinkConfig);
        job = gen.compileJobGraph((OptimizedPlan) optPlan);
    }

    for (URL jar : jarFiles) {
        try {
            job.addJar(new Path(jar.toURI()));
        } catch (URISyntaxException e) {
            throw new RuntimeException("URL is invalid. This should not happen.", e);
        }
    }

    job.setClasspaths(classpaths);

    return job;
}
Example 2
Source File: ClassPathJobGraphRetriever.java From flink with Apache License 2.0 | 6 votes |
@Override
public JobGraph retrieveJobGraph(Configuration configuration) throws FlinkException {
    final PackagedProgram packagedProgram = createPackagedProgram();
    final int defaultParallelism = configuration.getInteger(CoreOptions.DEFAULT_PARALLELISM);
    try {
        final JobGraph jobGraph = PackagedProgramUtils.createJobGraph(
            packagedProgram,
            configuration,
            defaultParallelism,
            jobId);
        jobGraph.setAllowQueuedScheduling(true);
        jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

        return jobGraph;
    } catch (Exception e) {
        throw new FlinkException("Could not create the JobGraph from the provided user code jar.", e);
    }
}
Example 3
Source File: JobMasterTest.java From flink with Apache License 2.0 | 6 votes |
@Nonnull
private JobGraph createJobGraphFromJobVerticesWithCheckpointing(SavepointRestoreSettings savepointRestoreSettings, JobVertex... jobVertices) {
    final JobGraph jobGraph = new JobGraph(jobVertices);

    // enable checkpointing which is required to resume from a savepoint
    final CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration = new CheckpointCoordinatorConfiguration(
        1000L,
        1000L,
        1000L,
        1,
        CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
        true,
        false,
        0);
    final JobCheckpointingSettings checkpointingSettings = new JobCheckpointingSettings(
        Collections.emptyList(),
        Collections.emptyList(),
        Collections.emptyList(),
        checkpointCoordinatorConfiguration,
        null);
    jobGraph.setSnapshotSettings(checkpointingSettings);
    jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

    return jobGraph;
}
Example 4
Source File: DFCusterClient.java From df_data_service with Apache License 2.0 | 6 votes |
private JobGraph getJobGraph(FlinkPlan optPlan, List<URL> jarFiles, List<URL> classpaths, SavepointRestoreSettings savepointSettings) {
    JobGraph job;
    if (optPlan instanceof StreamingPlan) {
        job = ((StreamingPlan) optPlan).getJobGraph();
        job.setSavepointRestoreSettings(savepointSettings);
    } else {
        JobGraphGenerator gen = new JobGraphGenerator(this.flinkConfig);
        job = gen.compileJobGraph((OptimizedPlan) optPlan);
    }

    for (URL jar : jarFiles) {
        try {
            job.addJar(new Path(jar.toURI()));
        } catch (URISyntaxException e) {
            throw new RuntimeException("URL is invalid. This should not happen.", e);
        }
    }

    job.setClasspaths(classpaths);

    return job;
}
Example 5
Source File: AbstractOperatorRestoreTestBase.java From flink with Apache License 2.0 | 6 votes |
private void restoreJob(ClusterClient<?> clusterClient, Deadline deadline, String savepointPath) throws Exception {
    JobGraph jobToRestore = createJobGraph(ExecutionMode.RESTORE);
    jobToRestore.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, allowNonRestoredState));

    assertNotNull("Job doesn't have a JobID.", jobToRestore.getJobID());

    ClientUtils.submitJob(clusterClient, jobToRestore);

    CompletableFuture<JobStatus> jobStatusFuture = FutureUtils.retrySuccessfulWithDelay(
        () -> clusterClient.getJobStatus(jobToRestore.getJobID()),
        Time.milliseconds(50),
        deadline,
        (jobStatus) -> jobStatus == JobStatus.FINISHED,
        TestingUtils.defaultScheduledExecutor());
    assertEquals(
        JobStatus.FINISHED,
        jobStatusFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
}
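A note on the boolean flag used above: SavepointRestoreSettings.forPath(path, allowNonRestoredState) controls whether the restore may skip savepoint state that cannot be mapped to any operator in the new job graph. The single-argument forPath(path) defaults to false, in which case the restore fails if any savepoint state is left unassigned; passing true lets a modified job resume anyway, as the JobMaster tests in Examples 15 and 18 demonstrate.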
Example 6
Source File: JobMasterTest.java From Flink-CEPplus with Apache License 2.0 | 6 votes |
@Nonnull
private JobGraph createJobGraphFromJobVerticesWithCheckpointing(SavepointRestoreSettings savepointRestoreSettings, JobVertex... jobVertices) {
    final JobGraph jobGraph = new JobGraph(jobVertices);

    // enable checkpointing which is required to resume from a savepoint
    final CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration = new CheckpointCoordinatorConfiguration(
        1000L,
        1000L,
        1000L,
        1,
        CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
        true);
    final JobCheckpointingSettings checkpointingSettings = new JobCheckpointingSettings(
        Collections.emptyList(),
        Collections.emptyList(),
        Collections.emptyList(),
        checkpointCoordinatorConfiguration,
        null);
    jobGraph.setSnapshotSettings(checkpointingSettings);
    jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

    return jobGraph;
}
Example 7
Source File: ClassPathJobGraphRetriever.java From Flink-CEPplus with Apache License 2.0 | 6 votes |
@Override
public JobGraph retrieveJobGraph(Configuration configuration) throws FlinkException {
    final PackagedProgram packagedProgram = createPackagedProgram();
    final int defaultParallelism = configuration.getInteger(CoreOptions.DEFAULT_PARALLELISM);
    try {
        final JobGraph jobGraph = PackagedProgramUtils.createJobGraph(
            packagedProgram,
            configuration,
            defaultParallelism,
            jobId);
        jobGraph.setAllowQueuedScheduling(true);
        jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

        return jobGraph;
    } catch (Exception e) {
        throw new FlinkException("Could not create the JobGraph from the provided user code jar.", e);
    }
}
Example 8
Source File: PipelineExecutorUtils.java From flink with Apache License 2.0 | 6 votes |
/**
 * Creates the {@link JobGraph} corresponding to the provided {@link Pipeline}.
 *
 * @param pipeline the pipeline whose job graph we are computing
 * @param configuration the configuration with the necessary information such as jars and
 *                      classpaths to be included, the parallelism of the job and potential
 *                      savepoint settings used to bootstrap its state.
 * @return the corresponding {@link JobGraph}.
 */
public static JobGraph getJobGraph(@Nonnull final Pipeline pipeline, @Nonnull final Configuration configuration) throws MalformedURLException {
    checkNotNull(pipeline);
    checkNotNull(configuration);

    final ExecutionConfigAccessor executionConfigAccessor = ExecutionConfigAccessor.fromConfiguration(configuration);
    final JobGraph jobGraph = FlinkPipelineTranslationUtil
        .getJobGraph(pipeline, configuration, executionConfigAccessor.getParallelism());

    configuration
        .getOptional(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID)
        .ifPresent(strJobID -> jobGraph.setJobID(JobID.fromHexString(strJobID)));

    jobGraph.addJars(executionConfigAccessor.getJars());
    jobGraph.setClasspaths(executionConfigAccessor.getClasspaths());
    jobGraph.setSavepointRestoreSettings(executionConfigAccessor.getSavepointRestoreSettings());

    return jobGraph;
}
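In this example the savepoint settings travel through the Configuration rather than a method parameter: ExecutionConfigAccessor.fromConfiguration(configuration) extracts them, and getSavepointRestoreSettings() hands them to the JobGraph. A plausible way to populate such a configuration is sketched below, assuming Flink's SavepointConfigOptions constants (treat the exact option names, path, and flag values as illustrative assumptions):

import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.jobgraph.SavepointConfigOptions;

// Sketch: put savepoint restore settings into the configuration that
// PipelineExecutorUtils.getJobGraph(...) will later consume.
Configuration configuration = new Configuration();
configuration.setString(SavepointConfigOptions.SAVEPOINT_PATH, "file:///tmp/savepoints/savepoint-xyz");
configuration.setBoolean(SavepointConfigOptions.SAVEPOINT_IGNORE_UNCLAIMED_STATE, true);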
Example 9
Source File: StatefulFunctionsJobGraphRetriever.java From stateful-functions with Apache License 2.0 | 6 votes |
@Override
public JobGraph retrieveJobGraph(Configuration configuration) throws FlinkException {
    final PackagedProgram packagedProgram = createPackagedProgram();
    final int defaultParallelism = configuration.getInteger(CoreOptions.DEFAULT_PARALLELISM);
    try {
        final JobGraph jobGraph = PackagedProgramUtils.createJobGraph(
            packagedProgram,
            configuration,
            defaultParallelism,
            jobId,
            false);
        jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

        return jobGraph;
    } catch (Exception e) {
        throw new FlinkException("Could not create the JobGraph from the provided user code jar.", e);
    }
}
Example 10
Source File: AbstractOperatorRestoreTestBase.java From Flink-CEPplus with Apache License 2.0 | 6 votes |
private void restoreJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline, String savepointPath) throws Exception {
    JobGraph jobToRestore = createJobGraph(ExecutionMode.RESTORE);
    jobToRestore.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, allowNonRestoredState));

    assertNotNull("Job doesn't have a JobID.", jobToRestore.getJobID());

    clusterClient.submitJob(jobToRestore, classLoader);

    CompletableFuture<JobStatus> jobStatusFuture = FutureUtils.retrySuccessfulWithDelay(
        () -> clusterClient.getJobStatus(jobToRestore.getJobID()),
        Time.milliseconds(50),
        deadline,
        (jobStatus) -> jobStatus == JobStatus.FINISHED,
        TestingUtils.defaultScheduledExecutor());
    assertEquals(
        JobStatus.FINISHED,
        jobStatusFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
}
Example 11
Source File: SavepointITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
    final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
    jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
    final JobID jobId = jobGraph.getJobID();
    StatefulCounter.resetForTest(parallelism);

    MiniClusterWithClientResource cluster = clusterFactory.get();
    cluster.before();
    ClusterClient<?> client = cluster.getClusterClient();

    try {
        client.setDetached(true);
        client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

        // Await state is restored
        StatefulCounter.getRestoreLatch().await();

        // Await some progress after restore
        StatefulCounter.getProgressLatch().await();

        client.cancel(jobId);

        FutureUtils.retrySuccessfulWithDelay(
            () -> client.getJobStatus(jobId),
            Time.milliseconds(50),
            Deadline.now().plus(Duration.ofSeconds(30)),
            status -> status == JobStatus.CANCELED,
            TestingUtils.defaultScheduledExecutor()
        );

        client.disposeSavepoint(savepointPath).get();

        assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
    } finally {
        cluster.after();
        StatefulCounter.resetForTest(parallelism);
    }
}
Example 12
Source File: FlinkRequiresStableInputTest.java From beam with Apache License 2.0 | 5 votes |
private JobID restoreFromSavepoint(Pipeline pipeline, String savepointDir)
        throws ExecutionException, InterruptedException {
    JobGraph jobGraph = getJobGraph(pipeline);
    SavepointRestoreSettings savepointSettings = SavepointRestoreSettings.forPath(savepointDir);
    jobGraph.setSavepointRestoreSettings(savepointSettings);
    return flinkCluster.submitJob(jobGraph).get().getJobID();
}
Example 13
Source File: SavepointITCase.java From flink with Apache License 2.0 | 5 votes |
private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
    final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
    jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
    final JobID jobId = jobGraph.getJobID();
    StatefulCounter.resetForTest(parallelism);

    MiniClusterWithClientResource cluster = clusterFactory.get();
    cluster.before();
    ClusterClient<?> client = cluster.getClusterClient();

    try {
        client.setDetached(true);
        client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

        // Await state is restored
        StatefulCounter.getRestoreLatch().await();

        // Await some progress after restore
        StatefulCounter.getProgressLatch().await();

        client.cancel(jobId);

        FutureUtils.retrySuccessfulWithDelay(
            () -> client.getJobStatus(jobId),
            Time.milliseconds(50),
            Deadline.now().plus(Duration.ofSeconds(30)),
            status -> status == JobStatus.CANCELED,
            TestingUtils.defaultScheduledExecutor()
        );

        client.disposeSavepoint(savepointPath).get();

        assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
    } finally {
        cluster.after();
        StatefulCounter.resetForTest(parallelism);
    }
}
Example 14
Source File: SavepointWriterITCase.java From flink with Apache License 2.0 | 5 votes |
private void validateModification(String savepointPath) throws ProgramInvocationException {
    StreamExecutionEnvironment sEnv = StreamExecutionEnvironment.getExecutionEnvironment();
    sEnv.setStateBackend(backend);

    CollectSink.accountList.clear();

    DataStream<Account> stream = sEnv.fromCollection(accounts)
        .keyBy(acc -> acc.id)
        .flatMap(new UpdateAndGetAccount())
        .uid(ACCOUNT_UID);

    stream.addSink(new CollectSink());

    stream
        .map(acc -> acc.id)
        .map(new StatefulOperator())
        .uid(MODIFY_UID)
        .addSink(new DiscardingSink<>());

    JobGraph jobGraph = sEnv.getStreamGraph().getJobGraph();
    jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, false));

    ClusterClient<?> client = miniClusterResource.getClusterClient();
    client.submitJob(jobGraph, SavepointWriterITCase.class.getClassLoader());

    Assert.assertEquals("Unexpected output", 3, CollectSink.accountList.size());
}
Example 15
Source File: JobMasterTest.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
/**
 * Tests that a JobMaster will only restore a modified JobGraph if non
 * restored state is allowed.
 */
@Test
public void testRestoringModifiedJobFromSavepoint() throws Exception {
    // create savepoint data
    final long savepointId = 42L;
    final OperatorID operatorID = new OperatorID();
    final File savepointFile = createSavepointWithOperatorState(savepointId, operatorID);

    // set savepoint settings which don't allow non restored state
    final SavepointRestoreSettings savepointRestoreSettings = SavepointRestoreSettings.forPath(
        savepointFile.getAbsolutePath(),
        false);

    // create a new operator
    final JobVertex jobVertex = new JobVertex("New operator");
    jobVertex.setInvokableClass(NoOpInvokable.class);
    final JobGraph jobGraphWithNewOperator = createJobGraphFromJobVerticesWithCheckpointing(savepointRestoreSettings, jobVertex);

    final StandaloneCompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
    final TestingCheckpointRecoveryFactory testingCheckpointRecoveryFactory = new TestingCheckpointRecoveryFactory(completedCheckpointStore, new StandaloneCheckpointIDCounter());
    haServices.setCheckpointRecoveryFactory(testingCheckpointRecoveryFactory);

    try {
        createJobMaster(
            configuration,
            jobGraphWithNewOperator,
            haServices,
            new TestingJobManagerSharedServicesBuilder().build());
        fail("Should fail because we cannot resume the changed JobGraph from the savepoint.");
    } catch (IllegalStateException expected) {
        // that was expected :-)
    }

    // allow for non restored state
    jobGraphWithNewOperator.setSavepointRestoreSettings(
        SavepointRestoreSettings.forPath(
            savepointFile.getAbsolutePath(),
            true));

    final JobMaster jobMaster = createJobMaster(
        configuration,
        jobGraphWithNewOperator,
        haServices,
        new TestingJobManagerSharedServicesBuilder().build());

    try {
        // starting the JobMaster should have read the savepoint
        final CompletedCheckpoint savepointCheckpoint = completedCheckpointStore.getLatestCheckpoint();

        assertThat(savepointCheckpoint, Matchers.notNullValue());
        assertThat(savepointCheckpoint.getCheckpointID(), is(savepointId));
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
Example 16
Source File: AbstractOperatorRestoreTestBase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
private String migrateJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline) throws Throwable {
    URL savepointResource = AbstractOperatorRestoreTestBase.class.getClassLoader().getResource("operatorstate/" + getMigrationSavepointName());
    if (savepointResource == null) {
        throw new IllegalArgumentException("Savepoint file does not exist.");
    }
    JobGraph jobToMigrate = createJobGraph(ExecutionMode.MIGRATE);
    jobToMigrate.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointResource.getFile()));

    assertNotNull(jobToMigrate.getJobID());

    clusterClient.submitJob(jobToMigrate, classLoader);

    CompletableFuture<JobStatus> jobRunningFuture = FutureUtils.retrySuccessfulWithDelay(
        () -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
        Time.milliseconds(50),
        deadline,
        (jobStatus) -> jobStatus == JobStatus.RUNNING,
        TestingUtils.defaultScheduledExecutor());
    assertEquals(
        JobStatus.RUNNING,
        jobRunningFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

    // Trigger savepoint
    File targetDirectory = tmpFolder.newFolder();
    String savepointPath = null;

    // FLINK-6918: Retry cancel with savepoint message in case that StreamTasks were not running
    // TODO: The retry logic should be removed once the StreamTask lifecycle has been fixed (see FLINK-4714)
    while (deadline.hasTimeLeft() && savepointPath == null) {
        try {
            savepointPath = clusterClient.cancelWithSavepoint(
                jobToMigrate.getJobID(),
                targetDirectory.getAbsolutePath());
        } catch (Exception e) {
            String exceptionString = ExceptionUtils.stringifyException(e);
            if (!(exceptionString.matches("(.*\n)*.*savepoint for the job .* failed(.*\n)*") // legacy
                || exceptionString.matches("(.*\n)*.*was not running(.*\n)*")
                || exceptionString.matches("(.*\n)*.*Not all required tasks are currently running(.*\n)*") // new
                || exceptionString.matches("(.*\n)*.*Checkpoint was declined \\(tasks not ready\\)(.*\n)*"))) { // new
                throw e;
            }
        }
    }

    assertNotNull("Could not take savepoint.", savepointPath);

    CompletableFuture<JobStatus> jobCanceledFuture = FutureUtils.retrySuccessfulWithDelay(
        () -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
        Time.milliseconds(50),
        deadline,
        (jobStatus) -> jobStatus == JobStatus.CANCELED,
        TestingUtils.defaultScheduledExecutor());
    assertEquals(
        JobStatus.CANCELED,
        jobCanceledFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

    return savepointPath;
}
Example 17
Source File: AbstractOperatorRestoreTestBase.java From flink with Apache License 2.0 | 4 votes |
private String migrateJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline) throws Throwable {
    URL savepointResource = AbstractOperatorRestoreTestBase.class.getClassLoader().getResource("operatorstate/" + getMigrationSavepointName());
    if (savepointResource == null) {
        throw new IllegalArgumentException("Savepoint file does not exist.");
    }
    JobGraph jobToMigrate = createJobGraph(ExecutionMode.MIGRATE);
    jobToMigrate.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointResource.getFile()));

    assertNotNull(jobToMigrate.getJobID());

    clusterClient.submitJob(jobToMigrate, classLoader);

    CompletableFuture<JobStatus> jobRunningFuture = FutureUtils.retrySuccessfulWithDelay(
        () -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
        Time.milliseconds(50),
        deadline,
        (jobStatus) -> jobStatus == JobStatus.RUNNING,
        TestingUtils.defaultScheduledExecutor());
    assertEquals(
        JobStatus.RUNNING,
        jobRunningFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

    // Trigger savepoint
    File targetDirectory = tmpFolder.newFolder();
    String savepointPath = null;

    // FLINK-6918: Retry cancel with savepoint message in case that StreamTasks were not running
    // TODO: The retry logic should be removed once the StreamTask lifecycle has been fixed (see FLINK-4714)
    while (deadline.hasTimeLeft() && savepointPath == null) {
        try {
            savepointPath = clusterClient.cancelWithSavepoint(
                jobToMigrate.getJobID(),
                targetDirectory.getAbsolutePath());
        } catch (Exception e) {
            String exceptionString = ExceptionUtils.stringifyException(e);
            if (!PATTERN_CANCEL_WITH_SAVEPOINT_TOLERATED_EXCEPTIONS.matcher(exceptionString).find()) {
                throw e;
            }
        }
    }

    assertNotNull("Could not take savepoint.", savepointPath);

    CompletableFuture<JobStatus> jobCanceledFuture = FutureUtils.retrySuccessfulWithDelay(
        () -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
        Time.milliseconds(50),
        deadline,
        (jobStatus) -> jobStatus == JobStatus.CANCELED,
        TestingUtils.defaultScheduledExecutor());
    assertEquals(
        JobStatus.CANCELED,
        jobCanceledFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

    return savepointPath;
}
Example 18
Source File: JobMasterTest.java From flink with Apache License 2.0 | 4 votes |
/**
 * Tests that a JobMaster will only restore a modified JobGraph if non
 * restored state is allowed.
 */
@Test
public void testRestoringModifiedJobFromSavepoint() throws Exception {
    // create savepoint data
    final long savepointId = 42L;
    final OperatorID operatorID = new OperatorID();
    final File savepointFile = createSavepointWithOperatorState(savepointId, operatorID);

    // set savepoint settings which don't allow non restored state
    final SavepointRestoreSettings savepointRestoreSettings = SavepointRestoreSettings.forPath(
        savepointFile.getAbsolutePath(),
        false);

    // create a new operator
    final JobVertex jobVertex = new JobVertex("New operator");
    jobVertex.setInvokableClass(NoOpInvokable.class);
    final JobGraph jobGraphWithNewOperator = createJobGraphFromJobVerticesWithCheckpointing(savepointRestoreSettings, jobVertex);

    final StandaloneCompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
    final TestingCheckpointRecoveryFactory testingCheckpointRecoveryFactory = new TestingCheckpointRecoveryFactory(completedCheckpointStore, new StandaloneCheckpointIDCounter());
    haServices.setCheckpointRecoveryFactory(testingCheckpointRecoveryFactory);

    try {
        createJobMaster(
            configuration,
            jobGraphWithNewOperator,
            haServices,
            new TestingJobManagerSharedServicesBuilder().build());
        fail("Should fail because we cannot resume the changed JobGraph from the savepoint.");
    } catch (IllegalStateException expected) {
        // that was expected :-)
    }

    // allow for non restored state
    jobGraphWithNewOperator.setSavepointRestoreSettings(
        SavepointRestoreSettings.forPath(
            savepointFile.getAbsolutePath(),
            true));

    final JobMaster jobMaster = createJobMaster(
        configuration,
        jobGraphWithNewOperator,
        haServices,
        new TestingJobManagerSharedServicesBuilder().build());

    try {
        // starting the JobMaster should have read the savepoint
        final CompletedCheckpoint savepointCheckpoint = completedCheckpointStore.getLatestCheckpoint(false);

        assertThat(savepointCheckpoint, Matchers.notNullValue());
        assertThat(savepointCheckpoint.getCheckpointID(), is(savepointId));
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
Example 19
Source File: RescalingITCase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 *
 * @throws Exception
 */
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
    final int parallelism = numSlots / 2;
    final int parallelism2 = numSlots;
    final int maxParallelism = 13;

    Duration timeout = Duration.ofMinutes(3);
    Deadline deadline = Deadline.now().plus(timeout);

    ClusterClient<?> client = cluster.getClusterClient();

    try {
        JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

        final JobID jobID = jobGraph.getJobID();

        client.setDetached(true);
        client.submitJob(jobGraph, RescalingITCase.class.getClassLoader());

        // wait until the operator is started
        StateSourceBase.workStartedLatch.await();

        CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null);

        final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

        client.cancel(jobID);

        while (!getRunningJobs(client).isEmpty()) {
            Thread.sleep(50);
        }

        // job successfully removed
        JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

        client.setDetached(false);
        client.submitJob(scaledJobGraph, RescalingITCase.class.getClassLoader());
    } catch (JobExecutionException exception) {
        if (exception.getCause() instanceof IllegalStateException) {
            // we expect an IllegalStateException wrapped
            // in a JobExecutionException, because the job containing non-partitioned state
            // is being rescaled
        } else {
            throw exception;
        }
    }
}
Example 20
Source File: FlinkPravegaReaderSavepointITCase.java From flink-connectors with Apache License 2.0 | 4 votes |
@Test
public void testPravegaWithSavepoint() throws Exception {
    final int sourceParallelism = 4;
    final int numPravegaSegments = 4;
    final int numElements = NUM_STREAM_ELEMENTS;

    // set up the stream
    final String streamName = RandomStringUtils.randomAlphabetic(20);
    SETUP_UTILS.createTestStream(streamName, numPravegaSegments);

    // we create two independent Flink jobs (that come from the same program)
    final JobGraph program1 = getFlinkJob(sourceParallelism, streamName, numElements);

    try (
        final EventStreamWriter<Integer> eventWriter = SETUP_UTILS.getIntegerWriter(streamName);

        // create the producer that writes to the stream
        final ThrottledIntegerWriter producer = new ThrottledIntegerWriter(
            eventWriter,
            numElements,
            numElements / 2, // the latest when the thread must be un-throttled
            1,               // the initial sleep time per element
            false
        )
    ) {
        // the object on which we block while waiting for the checkpoint completion
        final OneShotLatch sync = new OneShotLatch();
        NotifyingMapper.TO_CALL_ON_COMPLETION.set(sync::trigger);

        // launch the Flink program from a separate thread
        final CheckedThread flinkRunner = new CheckedThread() {
            @Override
            public void go() throws Exception {
                MINI_CLUSTER.submitJob(program1);
            }
        };

        producer.start();
        flinkRunner.start();

        // wait until at least one checkpoint is complete before triggering the savepoint
        sync.await();

        // now that we are comfortably into the program, trigger a savepoint
        String savepointPath = null;

        // since with the short timeouts we configure in these tests, Pravega Checkpoints
        // sometimes don't complete in time, we retry a bit here
        for (int attempt = 1; savepointPath == null && attempt <= 5; attempt++) {
            savepointPath = MINI_CLUSTER.triggerSavepoint(program1.getJobID(), tmpFolder.newFolder().getAbsolutePath(), false).get();
        }
        assertNotNull("Failed to trigger a savepoint", savepointPath);

        // now cancel the job and relaunch a new one
        MINI_CLUSTER.cancelJob(program1.getJobID());

        try {
            // this throws an exception that the job was cancelled
            flinkRunner.sync();
        } catch (JobCancellationException ignored) {
        }

        producer.unthrottle();

        // now, resume with a new program
        final JobGraph program2 = getFlinkJob(sourceParallelism, streamName, numElements);
        program2.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, false));

        // if these calls complete without exception, then the test passes
        try {
            MINI_CLUSTER.executeJobBlocking(program2);
        } catch (Exception e) {
            if (!(ExceptionUtils.getRootCause(e) instanceof SuccessException)) {
                throw e;
            }
        }
    }
}