Details

    Description

      A possible deadlock was detected:

      placement.(*AppPlacementManager).initialise { m.Lock() } <<<<<
      placement.(*AppPlacementManager).initialise { } }
      placement.(*AppPlacementManager).UpdateRules { log.Log(log.Config).Info("Building new rule list for placement manager") }
      scheduler.(*PartitionContext).updatePartitionDetails { err := pc.placementManager.UpdateRules(conf.PlacementRules) }
      scheduler.(*ClusterContext).updateSchedulerConfig { err = part.updatePartitionDetails(p) }
      scheduler.(*ClusterContext).processRMConfigUpdateEvent { err = cc.updateSchedulerConfig(conf, rmID) }
      scheduler.(*Scheduler).handleRMEvent { case *rmevent.RMConfigUpdateEvent: }
      
      scheduler.(*PartitionContext).GetQueue { pc.RLock() } <<<<<
      scheduler.(*PartitionContext).GetQueue { func (pc *PartitionContext) GetQueue(name string) *objects.Queue { }
      placement.(*providedRule).placeApplication { // if we cannot create the queue must exist }
      placement.(*AppPlacementManager).PlaceApplication { queueName, err = checkRule.placeApplication(app, m.queueFn) }
      scheduler.(*PartitionContext).AddApplication { err := pc.getPlacementManager().PlaceApplication(app) }
      scheduler.(*ClusterContext).handleRMUpdateApplicationEvent { schedApp := objects.NewApplication(app, ugi, cc.rmEventHandler, request.RmID) }
      scheduler.(*Scheduler).handleRMEvent { case ev := <-s.pendingEvents: }
      

      The lock order is different between PartitionContext and AppPlacementManager: the configuration update path holds the PartitionContext lock when initialise() takes the placement manager lock, while PlaceApplication holds the placement manager lock when a placement rule calls back into PartitionContext.GetQueue through queueFn.
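
      For illustration, below is a minimal, self-contained sketch of this inverted lock order; the types and method names are simplified stand-ins for PartitionContext and AppPlacementManager, not the actual implementation:

      package main

      import (
          "sync"
          "time"
      )

      // Simplified stand-ins for PartitionContext and AppPlacementManager.
      type partition struct {
          sync.RWMutex
          manager *manager
      }

      type manager struct {
          sync.Mutex
          queueFn func(string) string
      }

      // Config update path: the partition lock is taken first, then the
      // manager lock inside initialise().
      func (pc *partition) updateRules() {
          pc.Lock()
          defer pc.Unlock()
          pc.manager.initialise()
      }

      func (m *manager) initialise() {
          m.Lock()
          defer m.Unlock()
      }

      // Placement path: the manager lock is taken first, then the partition
      // read lock inside the queueFn callback.
      func (m *manager) placeApplication() {
          m.Lock()
          defer m.Unlock()
          m.queueFn("root.default")
      }

      func (pc *partition) getQueue(name string) string {
          pc.RLock()
          defer pc.RUnlock()
          return name
      }

      func main() {
          pc := &partition{}
          m := &manager{queueFn: pc.getQueue}
          pc.manager = m
          // Run the two paths concurrently: one goroutine can end up holding
          // the partition lock while waiting for the manager lock, while the
          // other holds the manager lock and waits for the partition lock.
          go pc.updateRules()
          go m.placeApplication()
          time.Sleep(100 * time.Millisecond)
      }

      The standard ways out are to acquire the two locks in a consistent order everywhere, or to avoid calling into the other object while holding one of the locks.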

      There is also lock-order interference between PartitionContext and an Application object (a sketch follows the traces):

      objects.(*Application).SetTerminatedCallback { sa.Lock() } <<<<<
      objects.(*Application).SetTerminatedCallback { func (sa *Application) SetTerminatedCallback(callback func(appID string)) { }
      scheduler.(*PartitionContext).AddApplication { app.SetTerminatedCallback(pc.moveTerminatedApp) }
      scheduler.(*ClusterContext).handleRMUpdateApplicationEvent { schedApp := objects.NewApplication(app, ugi, cc.rmEventHandler, request.RmID) }
      scheduler.(*Scheduler).handleRMEvent { case ev := <-s.pendingEvents: }
      
      scheduler.(*PartitionContext).GetNode { pc.RLock() } <<<<<
      scheduler.(*PartitionContext).GetNode { func (pc *PartitionContext) GetNode(nodeID string) *objects.Node { }
      objects.(*Application).tryPlaceholderAllocate { // resource usage should not change anyway between placeholder and real one at this point }
      objects.(*Queue).TryPlaceholderAllocate { for _, app := range sq.sortApplications(true) { }
      objects.(*Queue).TryPlaceholderAllocate { for _, child := range sq.sortQueues() { }
      scheduler.(*PartitionContext).tryPlaceholderAllocate { alloc := pc.root.TryPlaceholderAllocate(pc.GetNodeIterator, pc.GetNode) }
      scheduler.(*ClusterContext).schedule { // nothing reserved that can be allocated try normal allocate }
      scheduler.(*Scheduler).MultiStepSchedule { // Note, this sleep only works in tests. }
      tests.TestDupReleasesInGangScheduling { // and it waits for the shim's confirmation }
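
      The same pattern can be reproduced with a small sketch: passing a PartitionContext method value such as GetNode into code that already holds the Application lock reverses the lock order. The types below are simplified stand-ins, not the actual YuniKorn code:

      package main

      import (
          "sync"
          "time"
      )

      // Simplified stand-ins for PartitionContext and Application.
      type partitionContext struct {
          sync.RWMutex
          nodes map[string]string
      }

      type application struct {
          sync.Mutex
          terminatedCallback func(appID string)
      }

      // AddApplication-like path: the partition lock is held while the
      // application lock is taken in setTerminatedCallback.
      func (pc *partitionContext) addApplication(app *application) {
          pc.Lock()
          defer pc.Unlock()
          app.setTerminatedCallback(func(string) {})
      }

      func (sa *application) setTerminatedCallback(cb func(appID string)) {
          sa.Lock()
          defer sa.Unlock()
          sa.terminatedCallback = cb
      }

      // Allocation-like path: the application lock is held while the getNode
      // method value takes the partition read lock, reversing the order.
      func (sa *application) tryPlaceholderAllocate(getNode func(string) string) {
          sa.Lock()
          defer sa.Unlock()
          _ = getNode("node-1")
      }

      func (pc *partitionContext) getNode(nodeID string) string {
          pc.RLock()
          defer pc.RUnlock()
          return pc.nodes[nodeID]
      }

      func main() {
          pc := &partitionContext{nodes: map[string]string{"node-1": "node-1"}}
          app := &application{}
          // Running the two paths concurrently can block both goroutines.
          go pc.addApplication(app)
          go app.tryPlaceholderAllocate(pc.getNode)
          time.Sleep(100 * time.Millisecond)
      }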
      

      There is no need for locked access to PartitionContext.nodes. The base implementation of NodeCollection (baseNodeCollection) is already internally synchronized, and the "nodes" field is set only once. Therefore, no locking is necessary when accessing it.
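
      A minimal sketch of that direction is below; the collection is an illustrative stand-in for baseNodeCollection, and the field and method names are assumptions, not the actual implementation:

      package main

      import "sync"

      type node struct{ id string }

      // nodeCollection is an illustrative stand-in for baseNodeCollection:
      // it guards its own map, so callers need no outer lock.
      type nodeCollection struct {
          sync.RWMutex
          nodes map[string]*node
      }

      func (nc *nodeCollection) getNode(nodeID string) *node {
          nc.RLock()
          defer nc.RUnlock()
          return nc.nodes[nodeID]
      }

      // partitionContext keeps its lock for other mutable fields; the nodes
      // field is assigned once at construction and never reassigned.
      type partitionContext struct {
          sync.RWMutex
          nodes *nodeCollection
      }

      func newPartitionContext() *partitionContext {
          return &partitionContext{nodes: &nodeCollection{nodes: map[string]*node{}}}
      }

      // getNode no longer takes the partition lock: the field is immutable
      // after construction and the collection synchronizes itself, so the
      // partition lock is not dragged into callers that already hold other
      // locks.
      func (pc *partitionContext) getNode(nodeID string) *node {
          return pc.nodes.getNode(nodeID)
      }

      func main() {
          pc := newPartitionContext()
          _ = pc.getNode("node-1")
      }

      With GetNode (and similar accessors) no longer taking the PartitionContext lock, that lock drops out of the Application/PartitionContext cycle shown in the second trace.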
