2 changes: 1 addition & 1 deletion ray-operator/DEVELOPMENT.md
@@ -260,7 +260,7 @@ Run tests on your local environment

We use [elastic/crd-ref-docs](https://github.com/elastic/crd-ref-docs) to generate API reference for CRDs of KubeRay. The configuration file of `crd-ref-docs` is located at `hack/config.yaml`. Please refer to the documentation for more details.

-Generate API refernece:
+Generate API reference:
Author commented:
drive-by

Contributor commented:
Thanks!
Just a friendly reminder that the auto-generated docs and Helm chart might need to be updated as well. You can usually regenerate them by running the commands in https://github.com/ray-project/kuberay/blob/master/ray-operator/DEVELOPMENT.md#consistency-check.


```bash
make api-docs
```
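Following up on the reviewer's reminder, a sketch of the typical regeneration flow. Only `api-docs` appears in the diff above; the other target names are assumed from kubebuilder conventions, so defer to DEVELOPMENT.md#consistency-check for the authoritative list:

```bash
# Regenerate derived artifacts after editing API types.
# Target names other than api-docs are assumptions based on kubebuilder
# conventions -- check DEVELOPMENT.md#consistency-check before relying on them.
make generate    # regenerates zz_generated.deepcopy.go
make manifests   # regenerates config/crd/bases/*.yaml
make api-docs    # regenerates the API reference
```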
11 changes: 11 additions & 0 deletions ray-operator/apis/ray/v1/raycluster_types.go
@@ -25,6 +25,12 @@ type RayClusterSpec struct {
	// +kubebuilder:validation:XValidation:rule="self in ['ray.io/kuberay-operator', 'kueue.x-k8s.io/multikueue']",message="the managedBy field value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'"
	// +optional
	ManagedBy *string `json:"managedBy,omitempty"`
	// TTLSeconds specifies the time-to-live for the RayCluster in seconds.
	// After this time has elapsed since the RayCluster was created, the cluster will be automatically deleted.
	// If not specified or set to 0, the cluster will not have a TTL.
	// +kubebuilder:default:=0
	// +optional
	TTLSeconds *int32 `json:"ttlSeconds,omitempty"`
	// AutoscalerOptions specifies optional configuration for the Ray autoscaler.
	// +optional
	AutoscalerOptions *AutoscalerOptions `json:"autoscalerOptions,omitempty"`
@@ -217,6 +223,11 @@ type RayClusterStatus struct {
	// LastUpdateTime indicates last update timestamp for this cluster status.
	// +nullable
	LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"`
	// TTLExpirationTime indicates when the RayCluster should be deleted based on its TTL.
	// This field is set when TTLSeconds is specified in the RayClusterSpec.
	// +nullable
	// +optional
	TTLExpirationTime *metav1.Time `json:"ttlExpirationTime,omitempty"`
Member commented:
Not sure we need this; can we just use creationTimestamp + ttlSeconds? Or does the TTL timer start once the RayCluster is ready?

	// StateTransitionTimes indicates the time of the last state transition for each state.
	// +optional
	StateTransitionTimes map[ClusterState]*metav1.Time `json:"stateTransitionTimes,omitempty"`
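On the reviewer's question above: a minimal sketch of computing expiry on demand from creationTimestamp + ttlSeconds, without persisting it in status. Illustrative only; the helper name `ttlExpired` is hypothetical, and it assumes the timer starts at creation rather than at readiness:

```go
package main

import (
	"fmt"
	"time"
)

// ttlExpired reports whether a cluster created at creationTime has outlived
// ttlSeconds, computed on the fly rather than stored in a status field.
func ttlExpired(creationTime time.Time, ttlSeconds int32, now time.Time) bool {
	if ttlSeconds <= 0 {
		return false // 0 or unset disables the TTL
	}
	return !now.Before(creationTime.Add(time.Duration(ttlSeconds) * time.Second))
}

func main() {
	created := time.Now().Add(-2 * time.Hour)
	fmt.Println(ttlExpired(created, 3600, time.Now())) // true: 2h old, 1h TTL
}
```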
9 changes: 9 additions & 0 deletions ray-operator/apis/ray/v1/zz_generated.deepcopy.go
(generated file; diff not rendered)

8 changes: 8 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayclusters.yaml
(generated file; diff not rendered)

8 changes: 8 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayjobs.yaml
(generated file; diff not rendered)

12 changes: 12 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayservices.yaml
(generated file; diff not rendered)
83 changes: 83 additions & 0 deletions ray-operator/config/samples/ray-cluster-ttl.yaml
@@ -0,0 +1,83 @@
# This sample demonstrates RayCluster with TTL (Time-To-Live) functionality.
# The cluster will be automatically deleted after the specified TTL period.
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: raycluster-ttl-example
spec:
  # TTL in seconds - the cluster will be deleted after this time has elapsed since creation.
  # In this example, the cluster will be deleted after 1 hour (3600 seconds).
  # Set to 0 or omit this field to disable TTL functionality.
  ttlSeconds: 3600

  # Ray version
  rayVersion: '2.46.0'

  # Enable autoscaling
  enableInTreeAutoscaling: true

  # Head group specification
  headGroupSpec:
    # Head group pod template
    template:
      spec:
        containers:
        - name: ray-head
          image: rayproject/ray:2.46.0
          ports:
          - containerPort: 6379
            name: gcs-server
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          resources:
            limits:
              cpu: "2"
              memory: "4Gi"
            requests:
              cpu: "2"
              memory: "4Gi"
          volumeMounts:
          - mountPath: /tmp/ray
            name: ray-logs
        volumes:
        - name: ray-logs
          emptyDir: {}
    # Service type for the head group
    serviceType: ClusterIP

  # Worker group specifications
  workerGroupSpecs:
  # The number of pod replicas in this worker group
  - replicas: 2
    minReplicas: 1
    maxReplicas: 5
    # Logical group name; this one is called small-group, but the name can describe the group's function
    groupName: small-group
    # The `rayStartParams` are used to configure the `ray start` command.
    # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of KubeRay.
    rayStartParams: {}
    # Pod template
    template:
      spec:
        containers:
        - name: ray-worker
          image: rayproject/ray:2.46.0
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
          resources:
            limits:
              cpu: "1"
              memory: "2Gi"
            requests:
              cpu: "1"
              memory: "2Gi"
          volumeMounts:
          - mountPath: /tmp/ray
            name: ray-logs
        volumes:
        - name: ray-logs
          emptyDir: {}
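A quick way to try the sample, assuming the operator built from this PR is running in the cluster (the resource and path names come from the sample above):

```bash
kubectl apply -f ray-operator/config/samples/ray-cluster-ttl.yaml
# Inspect the computed expiration written to status (field added in this PR)
kubectl get raycluster raycluster-ttl-example -o jsonpath='{.status.ttlExpirationTime}'
# Watch the cluster until the controller deletes it after ~3600s
kubectl get raycluster raycluster-ttl-example -w
```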
76 changes: 74 additions & 2 deletions ray-operator/controllers/ray/raycluster_controller.go
@@ -299,6 +299,7 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, instance
		r.reconcileHeadService,
		r.reconcileHeadlessService,
		r.reconcileServeService,
		r.reconcileTTL,
		r.reconcilePods,
	}

@@ -348,8 +349,22 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, instance
		)
		requeueAfterSeconds = utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS
	}
-	logger.Info("Unconditional requeue after", "seconds", requeueAfterSeconds)
-	return ctrl.Result{RequeueAfter: time.Duration(requeueAfterSeconds) * time.Second}, nil
+	// Check if TTL expiration requires a sooner requeue
+	requeueDuration := time.Duration(requeueAfterSeconds) * time.Second
+	if newInstance.Spec.TTLSeconds != nil && *newInstance.Spec.TTLSeconds > 0 && newInstance.Status.TTLExpirationTime != nil {
+		timeUntilExpiration := time.Until(newInstance.Status.TTLExpirationTime.Time)
+		// Add a small buffer (2 seconds) to ensure we process the deletion promptly
+		ttlRequeue := timeUntilExpiration + (2 * time.Second)
+
+		// Only use the TTL requeue if it's positive and sooner than the default requeue
+		if ttlRequeue > 0 && ttlRequeue < requeueDuration {
+			requeueDuration = ttlRequeue
+			logger.Info("Using TTL-based requeue", "ttlRequeueSeconds", ttlRequeue.Seconds(), "defaultRequeueSeconds", requeueAfterSeconds)
+		}
+	}
+
+	logger.Info("Unconditional requeue after", "seconds", requeueDuration.Seconds())
+	return ctrl.Result{RequeueAfter: requeueDuration}, nil
}
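To make the requeue math above concrete, a standalone sketch; the helper name `nextRequeue` is hypothetical and mirrors, rather than replaces, the controller logic:

```go
package main

import (
	"fmt"
	"time"
)

// nextRequeue takes the default requeue interval but wakes up earlier
// (TTL expiry plus a 2s buffer) when the TTL fires sooner.
func nextRequeue(defaultRequeue time.Duration, expiration, now time.Time) time.Duration {
	requeue := defaultRequeue
	ttlRequeue := expiration.Sub(now) + 2*time.Second
	if ttlRequeue > 0 && ttlRequeue < requeue {
		requeue = ttlRequeue
	}
	return requeue
}

func main() {
	now := time.Now()
	// Cluster expires in 90s; default requeue is 300s -> requeue after 92s.
	fmt.Println(nextRequeue(300*time.Second, now.Add(90*time.Second), now)) // 1m32s
}
```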

func (r *RayClusterReconciler) reconcileIngress(ctx context.Context, instance *rayv1.RayCluster) error {
@@ -1593,3 +1608,60 @@ func setDefaults(instance *rayv1.RayCluster) {
		}
	}
}

// reconcileTTL handles the TTL (Time-To-Live) functionality for RayCluster
func (r *RayClusterReconciler) reconcileTTL(ctx context.Context, instance *rayv1.RayCluster) error {
	logger := ctrl.LoggerFrom(ctx)

	// If TTLSeconds is not set or is 0, no TTL functionality is needed
	if instance.Spec.TTLSeconds == nil || *instance.Spec.TTLSeconds == 0 {
		// If TTL was previously set but now removed, clear the expiration time
		if instance.Status.TTLExpirationTime != nil {
			instance.Status.TTLExpirationTime = nil
		}
		return nil
	}

	ttlSeconds := *instance.Spec.TTLSeconds
	creationTime := instance.CreationTimestamp.Time
Member commented:
Should the expiration time be measured from RayCluster creation or from when the RayCluster reaches the ready state?

	expirationTime := creationTime.Add(time.Duration(ttlSeconds) * time.Second)

	// Update the TTL expiration time in status if it's not set or different
	if instance.Status.TTLExpirationTime == nil || !instance.Status.TTLExpirationTime.Time.Equal(expirationTime) {
		instance.Status.TTLExpirationTime = &metav1.Time{Time: expirationTime}
		logger.Info("TTL expiration time set", "expirationTime", expirationTime, "ttlSeconds", ttlSeconds)
	}

	nowTime := time.Now()

	// Check if the TTL has expired
	if expirationTime.Before(nowTime) || expirationTime.Equal(nowTime) {
		logger.Info("RayCluster TTL has expired, deleting cluster",
			"creationTime", creationTime,
			"expirationTime", expirationTime,
			"currentTime", nowTime,
			"ttlSeconds", ttlSeconds)

		// Delete the RayCluster
		if err := r.Client.Delete(ctx, instance); err != nil {
			logger.Error(err, "Failed to delete RayCluster due to TTL expiration")
			return err
		}

		r.Recorder.Eventf(instance, corev1.EventTypeNormal, "TTLExpired",
			"RayCluster %s/%s deleted due to TTL expiration after %d seconds",
			instance.Namespace, instance.Name, ttlSeconds)

		return nil
	}

	// Calculate time until expiration and log it
	timeUntilExpiration := time.Until(expirationTime)
	logger.V(1).Info("RayCluster TTL status",
		"creationTime", creationTime,
		"expirationTime", expirationTime,
		"timeUntilExpiration", timeUntilExpiration,
		"ttlSeconds", ttlSeconds)

	return nil
}
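Since the deletion is surfaced as a Kubernetes event with reason TTLExpired (per the Recorder call above), one way to confirm it fired is the sketch below; event retention windows vary by cluster:

```bash
# List events emitted for TTL-driven deletions
kubectl get events --field-selector reason=TTLExpired
```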
4 changes: 4 additions & 0 deletions ray-operator/controllers/ray/utils/validation.go
@@ -44,6 +44,10 @@ func ValidateRayClusterSpec(spec *rayv1.RayClusterSpec, annotations map[string]s
		}
	}

	if spec.TTLSeconds != nil && *spec.TTLSeconds < 0 {
		return fmt.Errorf("ttlSeconds must be a non-negative integer")
	}

	if annotations[RayFTEnabledAnnotationKey] != "" && spec.GcsFaultToleranceOptions != nil {
		return fmt.Errorf("%s annotation and GcsFaultToleranceOptions are both set. "+
			"Please use only GcsFaultToleranceOptions to configure GCS fault tolerance", RayFTEnabledAnnotationKey)
48 changes: 48 additions & 0 deletions ray-operator/controllers/ray/utils/validation_test.go
@@ -1193,6 +1193,54 @@ func TestValidateRayServiceMetadata(t *testing.T) {
	require.NoError(t, err)
}

func TestValidateRayClusterSpecTTL(t *testing.T) {
	tests := []struct {
		name        string
		ttlSeconds  *int32
		expectError bool
	}{
		{
			name:        "TTLSeconds is nil (not set)",
			ttlSeconds:  nil,
			expectError: false,
		},
		{
			name:        "TTLSeconds is 0 (no TTL)",
			ttlSeconds:  ptr.To(int32(0)),
			expectError: false,
		},
		{
			name:        "TTLSeconds is positive",
			ttlSeconds:  ptr.To(int32(3600)),
			expectError: false,
		},
		{
			name:        "TTLSeconds is negative",
			ttlSeconds:  ptr.To(int32(-1)),
			expectError: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			spec := &rayv1.RayClusterSpec{
				TTLSeconds: tt.ttlSeconds,
				HeadGroupSpec: rayv1.HeadGroupSpec{
					Template: podTemplateSpec(nil, nil),
				},
			}

			err := ValidateRayClusterSpec(spec, map[string]string{})
			if tt.expectError {
				require.Error(t, err)
				require.Contains(t, err.Error(), "ttlSeconds must be a non-negative integer")
			} else {
				require.NoError(t, err)
			}
		})
	}
}

func createBasicRayClusterSpec() *rayv1.RayClusterSpec {
	return &rayv1.RayClusterSpec{
		HeadGroupSpec: rayv1.HeadGroupSpec{