✨ add ttlSeconds field to RayClusterSpec #4036
base: master
@@ -25,6 +25,12 @@ type RayClusterSpec struct {
    // +kubebuilder:validation:XValidation:rule="self in ['ray.io/kuberay-operator', 'kueue.x-k8s.io/multikueue']",message="the managedBy field value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'"
    // +optional
    ManagedBy *string `json:"managedBy,omitempty"`
    // TTLSeconds specifies the time-to-live for the RayCluster in seconds.
    // After this time has elapsed since the RayCluster was created, the cluster will be automatically deleted.
    // If not specified or set to 0, the cluster will not have a TTL.
    // +kubebuilder:default:=0
    // +optional
    TTLSeconds *int32 `json:"ttlSeconds,omitempty"`
    // AutoscalerOptions specifies optional configuration for the Ray autoscaler.
    // +optional
    AutoscalerOptions *AutoscalerOptions `json:"autoscalerOptions,omitempty"`
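For readers wiring this up from Go rather than YAML, a minimal sketch of setting the new field through the generated rayv1 API types follows; the int32Ptr helper and the object metadata are illustrative only, the PR itself only adds the ttlSeconds field.

package main

import (
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
)

// int32Ptr is a small illustrative helper for taking the address of a literal.
func int32Ptr(v int32) *int32 { return &v }

func main() {
    // A RayCluster whose spec asks the operator to delete it one hour after creation.
    // Everything except ttlSeconds (head/worker specs omitted) is placeholder content.
    cluster := rayv1.RayCluster{
        ObjectMeta: metav1.ObjectMeta{
            Name:      "raycluster-ttl-example",
            Namespace: "default",
        },
        Spec: rayv1.RayClusterSpec{
            TTLSeconds: int32Ptr(3600), // nil or 0 disables the TTL
        },
    }
    fmt.Println(*cluster.Spec.TTLSeconds)
}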
@@ -217,6 +223,11 @@ type RayClusterStatus struct {
    // LastUpdateTime indicates last update timestamp for this cluster status.
    // +nullable
    LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"`
    // TTLExpirationTime indicates when the RayCluster should be deleted based on its TTL.
    // This field is set when TTLSeconds is specified in the RayClusterSpec.
    // +nullable
    // +optional
    TTLExpirationTime *metav1.Time `json:"ttlExpirationTime,omitempty"`
Review comment: Not sure we need this; can we just use creationTimestamp + ttlSeconds? Or does the TTL timer start once the RayCluster is ready? (A sketch of the creationTimestamp-based alternative follows this hunk.)
    // StateTransitionTimes indicates the time of the last state transition for each state.
    // +optional
    StateTransitionTimes map[ClusterState]*metav1.Time `json:"stateTransitionTimes,omitempty"`
@@ -0,0 +1,83 @@
# This sample demonstrates a RayCluster with TTL (Time-To-Live) functionality.
# The cluster will be automatically deleted after the specified TTL period.
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: raycluster-ttl-example
spec:
  # TTL in seconds - the cluster will be deleted after this time has elapsed since creation.
  # In this example, the cluster will be deleted after 1 hour (3600 seconds).
  # Set to 0 or omit this field to disable TTL functionality.
  ttlSeconds: 3600

  # Ray version
  rayVersion: '2.46.0'

  # Enable autoscaling
  enableInTreeAutoscaling: true

  # Head group specification
  headGroupSpec:
    # Head group pod template
    template:
      spec:
        containers:
        - name: ray-head
          image: rayproject/ray:2.46.0
          ports:
          - containerPort: 6379
            name: gcs-server
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          resources:
            limits:
              cpu: "2"
              memory: "4Gi"
            requests:
              cpu: "2"
              memory: "4Gi"
          volumeMounts:
          - mountPath: /tmp/ray
            name: ray-logs
        volumes:
        - name: ray-logs
          emptyDir: {}
    # Service type for the head group
    serviceType: ClusterIP

  # Worker group specifications
  workerGroupSpecs:
  # The number of pod replicas in this worker group
  - replicas: 2
    minReplicas: 1
    maxReplicas: 5
    # Logical group name; here it is small-group, but it can also be functional.
    groupName: small-group
    # The `rayStartParams` are used to configure the `ray start` command.
    # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of KubeRay.
    rayStartParams: {}
    # Pod template
    template:
      spec:
        containers:
        - name: ray-worker
          image: rayproject/ray:2.46.0
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh", "-c", "ray stop"]
          resources:
            limits:
              cpu: "1"
              memory: "2Gi"
            requests:
              cpu: "1"
              memory: "2Gi"
          volumeMounts:
          - mountPath: /tmp/ray
            name: ray-logs
        volumes:
        - name: ray-logs
          emptyDir: {}
@@ -299,6 +299,7 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, instance
        r.reconcileHeadService,
        r.reconcileHeadlessService,
        r.reconcileServeService,
        r.reconcileTTL,
        r.reconcilePods,
    }
@@ -348,8 +349,22 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, instance
        )
        requeueAfterSeconds = utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS
    }
    logger.Info("Unconditional requeue after", "seconds", requeueAfterSeconds)
    return ctrl.Result{RequeueAfter: time.Duration(requeueAfterSeconds) * time.Second}, nil
    // Check if TTL expiration requires sooner requeue
    requeueDuration := time.Duration(requeueAfterSeconds) * time.Second
    if newInstance.Spec.TTLSeconds != nil && *newInstance.Spec.TTLSeconds > 0 && newInstance.Status.TTLExpirationTime != nil {
        timeUntilExpiration := time.Until(newInstance.Status.TTLExpirationTime.Time)
        // Add a small buffer (2 seconds) to ensure we process the deletion promptly
        ttlRequeue := timeUntilExpiration + (2 * time.Second)

        // Only use TTL requeue if it's sooner than default requeue and positive
        if ttlRequeue > 0 && ttlRequeue < requeueDuration {
            requeueDuration = ttlRequeue
            logger.Info("Using TTL-based requeue", "ttlRequeueSeconds", ttlRequeue.Seconds(), "defaultRequeueSeconds", requeueAfterSeconds)
        }
    }

    logger.Info("Unconditional requeue after", "seconds", requeueDuration.Seconds())
    return ctrl.Result{RequeueAfter: requeueDuration}, nil
}

func (r *RayClusterReconciler) reconcileIngress(ctx context.Context, instance *rayv1.RayCluster) error {
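The interaction between the default requeue period and the TTL deadline above can also be expressed as a small pure function, which makes the 2-second buffer and the "sooner of the two" rule easy to unit test. This is only a hedged restatement of the logic in the hunk, not code from the PR; the function name and the 300-second default used in the example are assumptions.

package rayclusterttl

import "time"

// nextRequeue picks the shorter of the default requeue period and the time
// remaining until the TTL deadline (plus a small buffer so an expired cluster
// is processed promptly). A nil deadline means no TTL is configured.
func nextRequeue(defaultRequeue time.Duration, ttlDeadline *time.Time, now time.Time) time.Duration {
    if ttlDeadline == nil {
        return defaultRequeue
    }
    ttlRequeue := ttlDeadline.Sub(now) + 2*time.Second
    if ttlRequeue > 0 && ttlRequeue < defaultRequeue {
        return ttlRequeue
    }
    return defaultRequeue
}

For example, with a default requeue of, say, 300 seconds and a deadline 30 seconds away, this returns roughly 32 seconds; once the deadline is more than 2 seconds in the past, the negative remainder falls back to the default period.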
@@ -1593,3 +1608,60 @@ func setDefaults(instance *rayv1.RayCluster) {
        }
    }
}

// reconcileTTL handles the TTL (Time-To-Live) functionality for RayCluster
func (r *RayClusterReconciler) reconcileTTL(ctx context.Context, instance *rayv1.RayCluster) error {
    logger := ctrl.LoggerFrom(ctx)

    // If TTLSeconds is not set or is 0, no TTL functionality is needed
    if instance.Spec.TTLSeconds == nil || *instance.Spec.TTLSeconds == 0 {
        // If TTL was previously set but now removed, clear the expiration time
        if instance.Status.TTLExpirationTime != nil {
            instance.Status.TTLExpirationTime = nil
        }
        return nil
    }

    ttlSeconds := *instance.Spec.TTLSeconds
    creationTime := instance.CreationTimestamp.Time
Review comment: Should the expiration time be computed from RayCluster creation or from the RayCluster ready state? (A sketch of the ready-state alternative follows this hunk.)

    expirationTime := creationTime.Add(time.Duration(ttlSeconds) * time.Second)

    // Update the TTL expiration time in status if it's not set or different
    if instance.Status.TTLExpirationTime == nil || !instance.Status.TTLExpirationTime.Time.Equal(expirationTime) {
        instance.Status.TTLExpirationTime = &metav1.Time{Time: expirationTime}
        logger.Info("TTL expiration time set", "expirationTime", expirationTime, "ttlSeconds", ttlSeconds)
    }

    nowTime := time.Now()

    // Check if the TTL has expired
    if expirationTime.Before(nowTime) || expirationTime.Equal(nowTime) {
        logger.Info("RayCluster TTL has expired, deleting cluster",
            "creationTime", creationTime,
            "expirationTime", expirationTime,
            "currentTime", nowTime,
            "ttlSeconds", ttlSeconds)

        // Delete the RayCluster
        if err := r.Client.Delete(ctx, instance); err != nil {
            logger.Error(err, "Failed to delete RayCluster due to TTL expiration")
            return err
        }

        r.Recorder.Eventf(instance, corev1.EventTypeNormal, "TTLExpired",
            "RayCluster %s/%s deleted due to TTL expiration after %d seconds",
            instance.Namespace, instance.Name, ttlSeconds)

        return nil
    }

    // Calculate time until expiration and log it
    timeUntilExpiration := time.Until(expirationTime)
    logger.V(1).Info("RayCluster TTL status",
        "creationTime", creationTime,
        "expirationTime", expirationTime,
        "timeUntilExpiration", timeUntilExpiration,
        "ttlSeconds", ttlSeconds)

    return nil
}
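Regarding the question of whether the TTL clock should start at creation or at readiness: one possible variant, sketched below under the assumption that Status.StateTransitionTimes records when the cluster entered the rayv1.Ready state, would anchor the deadline on that transition time instead of creationTimestamp. This only illustrates the alternative raised in review; it is not code from the PR.

package rayclusterttl

import (
    "time"

    rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
)

// ttlStartTime returns the instant the TTL countdown would begin if it were
// anchored on readiness rather than creation: the recorded transition into the
// Ready state when available, falling back to the creation timestamp otherwise.
// Assumes StateTransitionTimes is populated by the status reconciler.
func ttlStartTime(cluster *rayv1.RayCluster) time.Time {
    if t, ok := cluster.Status.StateTransitionTimes[rayv1.Ready]; ok && t != nil {
        return t.Time
    }
    return cluster.CreationTimestamp.Time
}

The deadline would then be ttlStartTime(cluster) plus the TTL duration, so a cluster that takes a while to become ready still gets its full TTL of usable time.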
Review comment: drive-by
Thanks!
Just a friendly reminder that the auto-generated docs and Helm chart might need to be updated as well. You can usually regenerate them by running the commands in https://github.com/ray-project/kuberay/blob/master/ray-operator/DEVELOPMENT.md#consistency-check.