Skip to content

Commit 2f5655a

Browse files
gaoyunhaii authored and dawidwys committed
[FLINK-21080][runtime][checkpoint] Report latest completed checkpoint id when notifying checkpoint abort
This closes apache#16633
1 parent 0182cd3 commit 2f5655a

File tree

22 files changed

+246
-33
lines changed

22 files changed

+246
-33
lines changed

flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/CheckpointCoordinator.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1314,14 +1314,18 @@ private void sendAcknowledgeMessages(
13141314

13151315
private void sendAbortedMessages(
13161316
List<ExecutionVertex> tasksToAbort, long checkpointId, long timeStamp) {
1317+
assert (Thread.holdsLock(lock));
1318+
long latestCompletedCheckpointId = completedCheckpointStore.getLatestCheckpointId();
1319+
13171320
// send notification of aborted checkpoints asynchronously.
13181321
executor.execute(
13191322
() -> {
13201323
// send the "abort checkpoint" messages to necessary vertices.
13211324
for (ExecutionVertex ev : tasksToAbort) {
13221325
Execution ee = ev.getCurrentExecutionAttempt();
13231326
if (ee != null) {
1324-
ee.notifyCheckpointAborted(checkpointId, timeStamp);
1327+
ee.notifyCheckpointAborted(
1328+
checkpointId, latestCompletedCheckpointId, timeStamp);
13251329
}
13261330
}
13271331
});

flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/CompletedCheckpointStore.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,21 @@ default CompletedCheckpoint getLatestCheckpoint(boolean isPreferCheckpointForRec
8585
return lastCompleted;
8686
}
8787

88+
/** Returns the id of the latest completed checkpoints. */
89+
default long getLatestCheckpointId() {
90+
try {
91+
List<CompletedCheckpoint> allCheckpoints = getAllCheckpoints();
92+
if (allCheckpoints.isEmpty()) {
93+
return 0;
94+
}
95+
96+
return allCheckpoints.get(allCheckpoints.size() - 1).getCheckpointID();
97+
} catch (Throwable throwable) {
98+
LOG.warn("Get the latest completed checkpoints failed", throwable);
99+
return 0;
100+
}
101+
}
102+
88103
/**
89104
* Shuts down the store.
90105
*

flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/Execution.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -796,16 +796,22 @@ public void notifyCheckpointComplete(long checkpointId, long timestamp) {
796796
* Notify the task of this execution about a aborted checkpoint.
797797
*
798798
* @param abortCheckpointId of the subsumed checkpoint
799+
* @param latestCompletedCheckpointId of the latest completed checkpoint
799800
* @param timestamp of the subsumed checkpoint
800801
*/
801-
public void notifyCheckpointAborted(long abortCheckpointId, long timestamp) {
802+
public void notifyCheckpointAborted(
803+
long abortCheckpointId, long latestCompletedCheckpointId, long timestamp) {
802804
final LogicalSlot slot = assignedResource;
803805

804806
if (slot != null) {
805807
final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();
806808

807809
taskManagerGateway.notifyCheckpointAborted(
808-
attemptId, getVertex().getJobId(), abortCheckpointId, timestamp);
810+
attemptId,
811+
getVertex().getJobId(),
812+
abortCheckpointId,
813+
latestCompletedCheckpointId,
814+
timestamp);
809815
} else {
810816
LOG.debug(
811817
"The execution has no slot assigned. This indicates that the execution is "

flink-runtime/src/main/java/org/apache/flink/runtime/jobgraph/tasks/AbstractInvokable.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,9 +282,11 @@ public Future<Void> notifyCheckpointCompleteAsync(long checkpointId) {
282282
* notification.
283283
*
284284
* @param checkpointId The ID of the checkpoint that is aborted.
285+
* @param latestCompletedCheckpointId The ID of the latest completed checkpoint.
285286
* @return future that completes when the notification has been processed by the task.
286287
*/
287-
public Future<Void> notifyCheckpointAbortAsync(long checkpointId) {
288+
public Future<Void> notifyCheckpointAbortAsync(
289+
long checkpointId, long latestCompletedCheckpointId) {
288290
throw new UnsupportedOperationException(
289291
String.format(
290292
"notifyCheckpointAbortAsync not supported by %s",

flink-runtime/src/main/java/org/apache/flink/runtime/jobmanager/slots/TaskManagerGateway.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,15 @@ void notifyCheckpointComplete(
102102
* @param executionAttemptID identifying the task
103103
* @param jobId identifying the job to which the task belongs
104104
* @param checkpointId of the subsumed checkpoint
105+
* @param latestCompletedCheckpointId of the latest completed checkpoint
105106
* @param timestamp of the subsumed checkpoint
106107
*/
107108
void notifyCheckpointAborted(
108-
ExecutionAttemptID executionAttemptID, JobID jobId, long checkpointId, long timestamp);
109+
ExecutionAttemptID executionAttemptID,
110+
JobID jobId,
111+
long checkpointId,
112+
long latestCompletedCheckpointId,
113+
long timestamp);
109114

110115
/**
111116
* Trigger for the given task a checkpoint.

flink-runtime/src/main/java/org/apache/flink/runtime/jobmaster/RpcTaskManagerGateway.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,13 @@ public void notifyCheckpointComplete(
8787

8888
@Override
8989
public void notifyCheckpointAborted(
90-
ExecutionAttemptID executionAttemptID, JobID jobId, long checkpointId, long timestamp) {
91-
taskExecutorGateway.abortCheckpoint(executionAttemptID, checkpointId, timestamp);
90+
ExecutionAttemptID executionAttemptID,
91+
JobID jobId,
92+
long checkpointId,
93+
long latestCompletedCheckpointId,
94+
long timestamp) {
95+
taskExecutorGateway.abortCheckpoint(
96+
executionAttemptID, checkpointId, latestCompletedCheckpointId, timestamp);
9297
}
9398

9499
@Override

flink-runtime/src/main/java/org/apache/flink/runtime/taskexecutor/TaskExecutor.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,7 +1000,10 @@ public CompletableFuture<Acknowledge> confirmCheckpoint(
10001000

10011001
@Override
10021002
public CompletableFuture<Acknowledge> abortCheckpoint(
1003-
ExecutionAttemptID executionAttemptID, long checkpointId, long checkpointTimestamp) {
1003+
ExecutionAttemptID executionAttemptID,
1004+
long checkpointId,
1005+
long latestCompletedCheckpointId,
1006+
long checkpointTimestamp) {
10041007
log.debug(
10051008
"Abort checkpoint {}@{} for {}.",
10061009
checkpointId,
@@ -1010,7 +1013,7 @@ public CompletableFuture<Acknowledge> abortCheckpoint(
10101013
final Task task = taskSlotTable.getTask(executionAttemptID);
10111014

10121015
if (task != null) {
1013-
task.notifyCheckpointAborted(checkpointId);
1016+
task.notifyCheckpointAborted(checkpointId, latestCompletedCheckpointId);
10141017

10151018
return CompletableFuture.completedFuture(Acknowledge.get());
10161019
} else {

flink-runtime/src/main/java/org/apache/flink/runtime/taskexecutor/TaskExecutorGateway.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,11 +156,15 @@ CompletableFuture<Acknowledge> confirmCheckpoint(
156156
*
157157
* @param executionAttemptID identifying the task
158158
* @param checkpointId unique id for the checkpoint
159+
* @param latestCompletedCheckpointId the id of the latest completed checkpoint
159160
* @param checkpointTimestamp is the timestamp when the checkpoint has been initiated
160161
* @return Future acknowledge if the checkpoint has been successfully confirmed
161162
*/
162163
CompletableFuture<Acknowledge> abortCheckpoint(
163-
ExecutionAttemptID executionAttemptID, long checkpointId, long checkpointTimestamp);
164+
ExecutionAttemptID executionAttemptID,
165+
long checkpointId,
166+
long latestCompletedCheckpointId,
167+
long checkpointTimestamp);
164168

165169
/**
166170
* Cancel the given task.

flink-runtime/src/main/java/org/apache/flink/runtime/taskexecutor/TaskExecutorGatewayDecoratorBase.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,9 +138,12 @@ public CompletableFuture<Acknowledge> confirmCheckpoint(
138138

139139
@Override
140140
public CompletableFuture<Acknowledge> abortCheckpoint(
141-
ExecutionAttemptID executionAttemptID, long checkpointId, long checkpointTimestamp) {
141+
ExecutionAttemptID executionAttemptID,
142+
long checkpointId,
143+
long latestCompletedCheckpointId,
144+
long checkpointTimestamp) {
142145
return originalGateway.abortCheckpoint(
143-
executionAttemptID, checkpointId, checkpointTimestamp);
146+
executionAttemptID, checkpointId, latestCompletedCheckpointId, checkpointTimestamp);
144147
}
145148

146149
@Override

flink-runtime/src/main/java/org/apache/flink/runtime/taskmanager/Task.java

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import org.apache.flink.api.common.JobID;
2424
import org.apache.flink.api.common.TaskInfo;
2525
import org.apache.flink.api.common.cache.DistributedCache;
26-
import org.apache.flink.api.common.state.CheckpointListener;
2726
import org.apache.flink.configuration.Configuration;
2827
import org.apache.flink.configuration.TaskManagerOptions;
2928
import org.apache.flink.core.fs.FileSystemSafetyNet;
@@ -137,11 +136,7 @@
137136
* <p>Each Task is run by one dedicated thread.
138137
*/
139138
public class Task
140-
implements Runnable,
141-
TaskSlotPayload,
142-
TaskActions,
143-
PartitionProducerStateProvider,
144-
CheckpointListener {
139+
implements Runnable, TaskSlotPayload, TaskActions, PartitionProducerStateProvider {
145140

146141
/** The class logger. */
147142
private static final Logger LOG = LoggerFactory.getLogger(Task.class);
@@ -1356,7 +1351,6 @@ public void triggerCheckpointBarrier(
13561351
}
13571352
}
13581353

1359-
@Override
13601354
public void notifyCheckpointComplete(final long checkpointID) {
13611355
final AbstractInvokable invokable = this.invokable;
13621356

@@ -1384,13 +1378,13 @@ public void notifyCheckpointComplete(final long checkpointID) {
13841378
}
13851379
}
13861380

1387-
@Override
1388-
public void notifyCheckpointAborted(final long checkpointID) {
1381+
public void notifyCheckpointAborted(
1382+
final long checkpointID, final long latestCompletedCheckpointId) {
13891383
final AbstractInvokable invokable = this.invokable;
13901384

13911385
if (executionState == ExecutionState.RUNNING && invokable != null) {
13921386
try {
1393-
invokable.notifyCheckpointAbortAsync(checkpointID);
1387+
invokable.notifyCheckpointAbortAsync(checkpointID, latestCompletedCheckpointId);
13941388
} catch (RejectedExecutionException ex) {
13951389
// This may happen if the mailbox is closed. It means that the task is shutting
13961390
// down, so we just ignore it.

0 commit comments

Comments
 (0)