diff --git a/docs/monitoring.md b/docs/monitoring.md index 5679ef548a5..39ab314ea57 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -235,6 +235,7 @@ These metrics are exposed by Celeborn worker. | CommitFilesFailCount | The count of commit files request failed in current worker. | | SlotsAllocated | Slots allocated in last hour. | | ActiveSlotsCount | The number of slots currently being used in a worker. | + | AvailableSlotsCount | The number of slots currently available in a worker. | | ReserveSlotsTime | ReserveSlots means acquire a disk buffer and record partition location. | | ActiveConnectionCount | The count of active network connection. | | NettyMemory | The total amount of off-heap memory used by celeborn worker. | diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala index 1c5bc201803..61e8067f565 100644 --- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala +++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala @@ -465,6 +465,9 @@ private[celeborn] class Worker( workerSource.addGauge(WorkerSource.ACTIVE_SLOTS_COUNT) { () => workerInfo.usedSlots() } + workerSource.addGauge(WorkerSource.AVAILABLE_SLOTS_COUNT) { () => + workerInfo.totalAvailableSlots() + } workerSource.addGauge(WorkerSource.IS_DECOMMISSIONING_WORKER) { () => if (shutdown.get() && (workerStatusManager.currentWorkerStatus.getState == State.InDecommission || workerStatusManager.currentWorkerStatus.getState == State.InDecommissionThenIdle)) { diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala index f9385805146..b70d8860ee7 100644 --- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala +++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala @@ -238,6 +238,7 @@ object WorkerSource { // slots val SLOTS_ALLOCATED = "SlotsAllocated" val ACTIVE_SLOTS_COUNT = "ActiveSlotsCount" + val AVAILABLE_SLOTS_COUNT = "AvailableSlotsCount" val RESERVE_SLOTS_TIME = "ReserveSlotsTime" // connection