Skip to content

Commit 9e172e2

Browse files
committed
fix
1 parent c4172e0 commit 9e172e2

File tree

4 files changed

+125
-67
lines changed

4 files changed

+125
-67
lines changed

master/src/main/java/org/apache/celeborn/service/deploy/master/clustermeta/AbstractMetaManager.java

+15
Original file line number | Diff line number | Diff line change
@@ -610,4 +610,19 @@ private void addShuffleFallbackCounts(Map<String, Long> fallbackCounts) {
610610
fallbackPolicy, (k, v) -> v == null ? fallbackCounts.get(k) : v + fallbackCounts.get(k));
611611
}
612612
}
613+
614+
public void updateWorkerResourceConsumptions(
615+
String host,
616+
int rpcPort,
617+
int pushPort,
618+
int fetchPort,
619+
int replicatePort,
620+
Map<UserIdentifier, ResourceConsumption> resourceConsumptions) {
621+
WorkerInfo worker =
622+
new WorkerInfo(host, rpcPort, pushPort, fetchPort, replicatePort, -1, null, null);
623+
synchronized (workersMap) {
624+
Optional<WorkerInfo> workerInfo = Optional.ofNullable(workersMap.get(worker.toUniqueId()));
625+
workerInfo.ifPresent(info -> info.updateThenGetUserResourceConsumption(resourceConsumptions));
626+
}
627+
}
613628
}

master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala

+8-5
Original file line number | Diff line number | Diff line change
@@ -186,11 +186,8 @@ private[celeborn] class Master(
186186
private val hasHDFSStorage = conf.hasHDFSStorage
187187
private val hasS3Storage = conf.hasS3Storage
188188

189-
// workerUniqueId -> ResourceConsumption
190-
private val workerToResourceConsumptions =
191-
JavaUtils.newConcurrentHashMap[String, util.Map[UserIdentifier, ResourceConsumption]]()
192189
private val quotaManager = new QuotaManager(
193-
workerToResourceConsumptions,
190+
statusSystem,
194191
masterSource,
195192
resourceConsumptionSource,
196193
conf,
@@ -675,9 +672,15 @@ private[celeborn] class Master(
675672
highWorkload,
676673
workerStatus,
677674
requestId)
675+
statusSystem.updateWorkerResourceConsumptions(
676+
host,
677+
rpcPort,
678+
pushPort,
679+
fetchPort,
680+
replicatePort,
681+
userResourceConsumption)
678682
}
679683

680-
workerToResourceConsumptions.put(targetWorker.toUniqueId(), userResourceConsumption)
681684
val expiredShuffleKeys = new util.HashSet[String]
682685
activeShuffleKeys.asScala.foreach { shuffleKey =>
683686
val (appId, shuffleId) = Utils.splitShuffleKey(shuffleKey)

master/src/main/scala/org/apache/celeborn/service/deploy/master/quota/QuotaManager.scala

+62-59
Original file line number | Diff line number | Diff line change
@@ -32,10 +32,11 @@ import org.apache.celeborn.common.util.{JavaUtils, ThreadUtils, Utils}
3232
import org.apache.celeborn.server.common.service.config.ConfigService
3333
import org.apache.celeborn.service.deploy.master.MasterSource
3434
import org.apache.celeborn.service.deploy.master.MasterSource.UPDATE_RESOURCE_CONSUMPTION_TIME
35+
import org.apache.celeborn.service.deploy.master.clustermeta.AbstractMetaManager
3536
import org.apache.celeborn.service.deploy.master.quota.QuotaStatus._
3637

3738
class QuotaManager(
38-
workerToResourceConsumptions: JMap[String, JMap[UserIdentifier, ResourceConsumption]],
39+
statusSystem: AbstractMetaManager,
3940
masterSource: MasterSource,
4041
resourceConsumptionSource: ResourceConsumptionSource,
4142
celebornConf: CelebornConf,
@@ -189,77 +190,79 @@ class QuotaManager(
189190
masterSource.sample(UPDATE_RESOURCE_CONSUMPTION_TIME, this.getClass.getSimpleName, Map.empty) {
190191
val clusterQuota = getClusterStorageQuota
191192
var clusterResourceConsumption = ResourceConsumption(0, 0, 0, 0)
193+
192194
val tenantResourceConsumption =
193-
workerToResourceConsumptions.asScala.flatMap(_._2.asScala).groupBy(_._1.tenantId).map {
194-
case (tenantId, tenantConsumptionList) =>
195-
var tenantResourceConsumption = ResourceConsumption(0, 0, 0, 0)
196-
val userResourceConsumption =
197-
tenantConsumptionList.groupBy(_._1).map {
198-
case (userIdentifier, userConsumptionList) =>
199-
// Step 1: Compute user consumption and set quota status.
200-
val resourceConsumptionList = userConsumptionList.values.toSeq
201-
val resourceConsumption = computeUserResourceConsumption(resourceConsumptionList)
195+
statusSystem.availableWorkers.asScala.flatMap { workerInfo =>
196+
workerInfo.userResourceConsumption.asScala
197+
}.groupBy(_._1.tenantId).toSeq.map { case (tenantId, tenantConsumptionList) =>
198+
var tenantResourceConsumption = ResourceConsumption(0, 0, 0, 0)
199+
val userResourceConsumption =
200+
tenantConsumptionList.groupBy(_._1).map {
201+
case (userIdentifier, userConsumptionList) =>
202+
// Step 1: Compute user consumption and set quota status.
203+
val resourceConsumptionList = userConsumptionList.map(_._2).toSeq
204+
val resourceConsumption = computeUserResourceConsumption(resourceConsumptionList)
202205

203-
// Step 2: Update user resource consumption metrics.
204-
// For extract metrics
205-
userResourceConsumptionMap.put(userIdentifier, resourceConsumption)
206-
registerUserResourceConsumptionMetrics(userIdentifier)
206+
// Step 2: Update user resource consumption metrics.
207+
// For extract metrics
208+
userResourceConsumptionMap.put(userIdentifier, resourceConsumption)
209+
registerUserResourceConsumptionMetrics(userIdentifier)
207210

208-
// Step 3: Expire user level exceeded app except already expired app
209-
clusterResourceConsumption = clusterResourceConsumption.add(resourceConsumption)
210-
tenantResourceConsumption = tenantResourceConsumption.add(resourceConsumption)
211-
val quotaStatus = checkUserQuotaSpace(userIdentifier, resourceConsumption)
212-
userQuotaStatus.put(userIdentifier, quotaStatus)
213-
if (interruptShuffleEnabled && quotaStatus.exceed) {
214-
val subResourceConsumptions = computeSubConsumption(resourceConsumptionList)
215-
// Compute expired size
216-
val (expired, notExpired) = subResourceConsumptions.partition { case (app, _) =>
217-
appQuotaStatus.containsKey(app)
218-
}
219-
val userConsumptions =
220-
expired.values.foldLeft(resourceConsumption)(_.subtract(_))
221-
expireApplication(
222-
userConsumptions,
223-
getUserStorageQuota(userIdentifier),
224-
notExpired.toSeq,
225-
USER_EXHAUSTED)
226-
(Option(subResourceConsumptions), resourceConsumptionList)
227-
} else {
228-
(None, resourceConsumptionList)
211+
// Step 3: Expire user level exceeded app except already expired app
212+
clusterResourceConsumption = clusterResourceConsumption.add(resourceConsumption)
213+
tenantResourceConsumption = tenantResourceConsumption.add(resourceConsumption)
214+
val quotaStatus = checkUserQuotaSpace(userIdentifier, resourceConsumption)
215+
userQuotaStatus.put(userIdentifier, quotaStatus)
216+
if (interruptShuffleEnabled && quotaStatus.exceed) {
217+
val subResourceConsumptions = computeSubConsumption(resourceConsumptionList)
218+
// Compute expired size
219+
val (expired, notExpired) = subResourceConsumptions.partition { case (app, _) =>
220+
appQuotaStatus.containsKey(app)
229221
}
230-
}
222+
val userConsumptions =
223+
expired.values.foldLeft(resourceConsumption)(_.subtract(_))
224+
expireApplication(
225+
userConsumptions,
226+
getUserStorageQuota(userIdentifier),
227+
notExpired.toSeq,
228+
USER_EXHAUSTED)
229+
(Option(subResourceConsumptions), resourceConsumptionList)
230+
} else {
231+
(None, resourceConsumptionList)
232+
}
233+
}
231234

232-
val quotaStatus = checkTenantQuotaSpace(tenantId, tenantResourceConsumption)
233-
tenantQuotaStatus.put(tenantId, quotaStatus)
234-
// Expire tenant level exceeded app except already expired app
235-
if (interruptShuffleEnabled && quotaStatus.exceed) {
236-
val appConsumptions = userResourceConsumption.map {
237-
case (None, subConsumptionList) => computeSubConsumption(subConsumptionList)
238-
case (Some(subConsumptions), _) => subConsumptions
239-
}.flatMap(_.toSeq).toSeq
235+
val quotaStatus = checkTenantQuotaSpace(tenantId, tenantResourceConsumption)
236+
tenantQuotaStatus.put(tenantId, quotaStatus)
237+
// Expire tenant level exceeded app except already expired app
238+
if (interruptShuffleEnabled && quotaStatus.exceed) {
239+
val appConsumptions = userResourceConsumption.map {
240+
case (None, subConsumptionList) => computeSubConsumption(subConsumptionList)
241+
case (Some(subConsumptions), _) => subConsumptions
242+
}.flatMap(_.toSeq).toSeq
240243

241-
// Compute nonExpired app total usage
242-
val (expired, notExpired) = appConsumptions.partition { case (app, _) =>
243-
appQuotaStatus.containsKey(app)
244-
}
245-
tenantResourceConsumption =
246-
expired.map(_._2).foldLeft(tenantResourceConsumption)(_.subtract(_))
247-
expireApplication(
248-
tenantResourceConsumption,
249-
getTenantStorageQuota(tenantId),
250-
notExpired,
251-
TENANT_EXHAUSTED)
252-
(Option(appConsumptions), tenantConsumptionList.values)
253-
} else {
254-
(None, tenantConsumptionList.values)
244+
// Compute nonExpired app total usage
245+
val (expired, notExpired) = appConsumptions.partition { case (app, _) =>
246+
appQuotaStatus.containsKey(app)
255247
}
248+
tenantResourceConsumption =
249+
expired.map(_._2).foldLeft(tenantResourceConsumption)(_.subtract(_))
250+
expireApplication(
251+
tenantResourceConsumption,
252+
getTenantStorageQuota(tenantId),
253+
notExpired,
254+
TENANT_EXHAUSTED)
255+
(Option(appConsumptions), tenantConsumptionList.map(_._2).toSeq)
256+
} else {
257+
(None, tenantConsumptionList.map(_._2).toSeq)
258+
}
256259
}
257260

258261
// Expire cluster level exceeded app except already expired app
259262
clusterQuotaStatus = checkClusterQuotaSpace(clusterResourceConsumption)
260263
if (interruptShuffleEnabled && clusterQuotaStatus.exceed) {
261264
val appConsumptions = tenantResourceConsumption.map {
262-
case (None, subConsumptionList) => computeSubConsumption(subConsumptionList.toSeq)
265+
case (None, subConsumptionList) => computeSubConsumption(subConsumptionList)
263266
case (Some(subConsumptions), _) => subConsumptions
264267
}.flatMap(_.toSeq).toSeq
265268

master/src/test/scala/org/apache/celeborn/service/deploy/master/quota/QuotaManagerSuite.scala

+40-3
Original file line number | Diff line number | Diff line change
@@ -33,9 +33,11 @@ import org.apache.celeborn.common.meta.WorkerInfo
3333
import org.apache.celeborn.common.metrics.source.ResourceConsumptionSource
3434
import org.apache.celeborn.common.protocol.message.ControlMessages.CheckQuotaResponse
3535
import org.apache.celeborn.common.quota.{ResourceConsumption, StorageQuota}
36+
import org.apache.celeborn.common.rpc.RpcEnv
3637
import org.apache.celeborn.common.util.{JavaUtils, Utils}
3738
import org.apache.celeborn.server.common.service.config.{ConfigService, DynamicConfigServiceFactory, FsConfigServiceImpl}
3839
import org.apache.celeborn.service.deploy.master.MasterSource
40+
import org.apache.celeborn.service.deploy.master.clustermeta.{AbstractMetaManager, SingleMasterMetaManager}
3941

4042
class QuotaManagerSuite extends CelebornFunSuite
4143
with BeforeAndAfterAll
@@ -52,6 +54,10 @@ class QuotaManagerSuite extends CelebornFunSuite
5254
10003,
5355
10004)
5456

57+
var statusSystem: AbstractMetaManager = _
58+
59+
var rpcEnv: RpcEnv = _
60+
5561
val workerToResourceConsumptions =
5662
JavaUtils.newConcurrentHashMap[String, util.Map[UserIdentifier, ResourceConsumption]]()
5763

@@ -68,14 +74,27 @@ class QuotaManagerSuite extends CelebornFunSuite
6874
DynamicConfigServiceFactory.reset()
6975
configService = DynamicConfigServiceFactory.getConfigService(conf)
7076

77+
rpcEnv = RpcEnv.create(
78+
"test-rpc",
79+
"rpc",
80+
"localhost",
81+
9001,
82+
conf,
83+
None)
84+
statusSystem = new SingleMasterMetaManager(rpcEnv, conf)
85+
statusSystem.availableWorkers.add(worker)
7186
quotaManager = new QuotaManager(
72-
workerToResourceConsumptions,
87+
statusSystem,
7388
new MasterSource(conf),
7489
resourceConsumptionSource,
7590
conf,
7691
configService)
7792
}
7893

94+
override def afterAll(): Unit = {
95+
rpcEnv.shutdown()
96+
}
97+
7998
test("test celeborn quota conf") {
8099
configService.refreshCache()
81100
assertEquals(
@@ -415,8 +434,17 @@ class QuotaManagerSuite extends CelebornFunSuite
415434
conf1.set(
416435
CelebornConf.DYNAMIC_CONFIG_STORE_FS_PATH.key,
417436
getTestResourceFile("dynamicConfig-quota-2.yaml").getPath)
437+
val rpcEnv = RpcEnv.create(
438+
"test-rpc",
439+
"rpc",
440+
"localhost",
441+
9002,
442+
conf,
443+
None)
444+
val statusSystem1 = new SingleMasterMetaManager(rpcEnv, conf)
445+
statusSystem1.availableWorkers.add(worker)
418446
val quotaManager1 = new QuotaManager(
419-
workerToResourceConsumptions,
447+
statusSystem1,
420448
new MasterSource(conf1),
421449
resourceConsumptionSource,
422450
conf1,
@@ -514,8 +542,17 @@ class QuotaManagerSuite extends CelebornFunSuite
514542
conf1.set(
515543
CelebornConf.DYNAMIC_CONFIG_STORE_FS_PATH.key,
516544
getTestResourceFile("dynamicConfig-quota-3.yaml").getPath)
545+
val rpcEnv = RpcEnv.create(
546+
"test-rpc",
547+
"rpc",
548+
"localhost",
549+
9003,
550+
conf,
551+
None)
552+
val statusSystem1 = new SingleMasterMetaManager(rpcEnv, conf)
553+
statusSystem1.availableWorkers.add(worker)
517554
val quotaManager1 = new QuotaManager(
518-
workerToResourceConsumptions,
555+
statusSystem1,
519556
new MasterSource(conf1),
520557
resourceConsumptionSource,
521558
conf1,

0 commit comments

Comments
 (0)