Spark-SubmitTask

1.Rdd rdd中 reduce、fold、aggregate 这些ShuffleTask  还有collect、count这些finalTask 都会调用 sparkContext.runJob def reduce(f: (T, T) => T): T = withScope {   val cleanF = sc.clean(f)   val reducePartition: Iterator[T] => Option[T] = iter => {     if (iter.hasNext) {       Some(iter.reduceLeft(cleanF))     } else {       None     }   }   var jobResult: Option[T] = None   val mergeResult = (index: Int, taskResult: Option[T]) => {     if (taskResult.isDefined) {       jobResult = jobResult match {         case Some(value) => Some(f(value, taskResult.get))         case None => taskResult       }     }   }   sc.runJob(this, reducePartition, mergeResult)   // Get the final result out of our Option, or throw an exception if the RDD was empty   jobResult.getOrElse(throw new UnsupportedOperationException("empty collection")) }     def runJob[T, U: ClassTag](     rdd: RDD[T],     processPartition: Iterator[T] => U,     resultHandler: (Int, U) => Unit) {   val processFunc = (context: TaskContext, iter: Iterator[T]) => processPartition(iter)   runJob[T, U](rdd, processFunc, 0 until rdd.partitions.length, resultHandler) }   2.SparkContext def runJob[T, U: ClassTag](     rdd: RDD[T],     func: (TaskContext, Iterator[T]) => U,     partitions: Seq[Int],     resultHandler: (Int, U) => Unit): Unit = {   if (stopped.get()) {     throw new IllegalStateException("SparkContext has been shutdown")   }   val callSite = getCallSite   val cleanedFunc = clean(func)   logInfo("Starting job: " + callSite.shortForm)   if (conf.getBoolean("spark.logLineage", false)) {     logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)   }   dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)   progressBar.foreach(_.finishAll())   rdd.doCheckpoint() }   3.DAGSchedule def runJob[T, U](     rdd: RDD[T],     func: (TaskContext, Iterator[T]) => U,     partitions: Seq[Int],     callSite: CallSite,     resultHandler: (Int, U) => Unit,     properties: Properties): Unit = {   val start = System.nanoTime   val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)   ThreadUtils.awaitReady(waiter.completionFuture, Duration.Inf)   waiter.completionFuture.value.get match {     case scala.util.Success(_) =>       logInfo("Job %d finished: %s, took %f s".format         (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))     case scala.util.Failure(exception) =>       logInfo("Job %d failed: %s, took %f s".format         (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))       // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.       val callerStackTrace = Thread.currentThread().getStackTrace.tail       exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)       throw exception   } }     def submitJob[T, U](     rdd: RDD[T],     func: (TaskContext, Iterator[T]) => U,     partitions: Seq[Int],     callSite: CallSite,     resultHandler: (Int, U) => Unit,     properties: Properties): JobWaiter[U] = {   // Check to make sure we are not launching a task on a partition that does not exist.   val maxPartitions = rdd.partitions.length   partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>     throw new IllegalArgumentException(       "Attempting to access a non-existent partition: " + p + ". " +         "Total number of partitions: " + maxPartitions)   }       val jobId = nextJobId.getAndIncrement()   if (partitions.size == 0) {     // Return immediately if the job is running 0 tasks     return new JobWaiter[U](this, jobId, 0, resultHandler)   }       assert(partitions.size > 0)   val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]   val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)   eventProcessLoop.post((     jobId, rdd, func2, partitions.toArray, callSite, waiter,     SerializationUtils.clone(properties)))   waiter }   4.DAGSchedulerEventProcessLoop override def onReceive(event: DAGSchedulerEvent): Unit = {   val timerContext = timer.time()   try {     doOnReceive(event)   } finally {     timerContext.stop()   } }     private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {   case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>     dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)       case MapStageSubmitted(jobId, dependency, callSite, listener, properties) =>     dagScheduler.handleMapStageSubmitted(jobId, dependency, callSite, listener, properties)       case StageCancelled(stageId, reason) =>     dagScheduler.handleStageCancellation(stageId, reason)       case JobCancelled(jobId, reason) =>     dagScheduler.handleJobCancellation(jobId, reason)       case JobGroupCancelled(groupId) =>     dagScheduler.handleJobGroupCancelled(groupId)       case AllJobsCancelled =>     dagScheduler.doCancelAllJobs()       case ExecutorAdded(execId, host) =>     dagScheduler.handleExecutorAdded(execId, host)       case ExecutorLost(execId, reason) =>     val workerLost = reason match {       case SlaveLost(_, true) => true       case _ => false     }     dagScheduler.handleExecutorLost(execId, workerLost)       case WorkerRemoved(workerId, host, message) =>     dagScheduler.handleWorkerRemoved(workerId, host, message)       case BeginEvent(task, taskInfo) =>     dagScheduler.handleBeginEvent(task, taskInfo)       case SpeculativeTaskSubmitted(task) =>     dagScheduler.handleSpeculativeTaskSubmitted(task)       case GettingResultEvent(taskInfo) =>     dagScheduler.handleGetTaskResult(taskInfo)       case completion: CompletionEvent =>     dagScheduler.handleTaskCompletion(completion)       case TaskSetFailed(taskSet, reason, exception) =>     dagScheduler.handleTaskSetFailed(taskSet, reason, exception)       case ResubmitFailedStages =>     dagScheduler.resubmitFailedStages() }   5.DAGScheduler   M-submitStage 和 M-getMissingParentStages 构成spark stage划分  划分过程中创建stage 是 M-getOrCreateShuffleMapStage 第一次会创建,第二次就是从map中取(也就是从内存中取)   把一个app 划分成多个stage 使用M-submitMissingTasks 提交过去   M-submitStage 划分过程 ResultStage 是最后一个stage , 假如ResultStage 依赖ShuffleMapStage B ShuffleMapStage B 依赖ShuffleMapStage A 会优先提交A,提交后把 B 和Result 放入 waitingStages   M-submitMissingTasks  根据不同的Stage  将rdd 和 func 或者 stage.shuffleDep 封装到 taskBinaryBytes 最后更具不同的partition id放入Task 中  存入taskset 中   等A 运行完之后,最后一行 submitWaitingChildStages(stage)   M-submitWaitingChildStages 根据当前的stage 从waitingStages 找出当前的stage 的子stage  然后再次提交到  submitStage   M-getMissingParentStages if (!mapStage.isAvailable)  则不为true 则不会再次提交 这个是获取mapOutputTrackerMaster 中  _numAvailableOutputs 数量是否和分区数相等。如果相等,则表示 该Stage 已经处理过   taskBinaryBytes = stage match {   case stage: ShuffleMapStage =>     JavaUtils.bufferToArray(       closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))   case stage: ResultStage =>     JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef)) }     taskBinary = sc.broadcast(taskBinaryBytes)     new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,   taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),   Option(sc.applicationId), sc.applicationAttemptId, stage.rdd.isBarrier())   new ResultTask(stage.id, stage.latestInfo.attemptNumber,   taskBinary, part, locs, id, properties, serializedTaskMetrics,   Option(jobId), Option(sc.applicationId), sc.applicationAttemptId,   stage.rdd.isBarrier())     private[scheduler] def handleJobSubmitted(jobId: Int,     finalRDD: RDD[_],     func: (TaskContext, Iterator[_]) => _,     partitions: Array[Int],     callSite: CallSite,     listener: JobListener,     properties: Properties) {   var finalStage: ResultStage = null   try {     // New stage creation may throw an exception if, for example, jobs are run on a     // HadoopRDD whose underlying HDFS files have been deleted.     finalStage =  createResultStage(finalRDD, func, partitions, jobId, callSite)   } catch {     case e: BarrierJobSlotsNumberCheckFailed =>       logWarning(s"The job $jobId requires to run a barrier stage that requires more slots " +         "than the total number of slots in the cluster currently.")       // If jobId doesn't exist in the map, Scala coverts its value null to 0: Int automatically.       val numCheckFailures = barrierJobIdToNumTasksCheckFailures.compute(jobId,         new BiFunction[Int, Int, Int] {           override def apply(key: Int, value: Int): Int = value + 1         })       if (numCheckFailures <= maxFailureNumTasksCheck) {         messageScheduler.schedule(           new Runnable {             override def run(): Unit = eventProcessLoop.post(JobSubmitted(jobId, finalRDD, func,               partitions, callSite, listener, properties))           },           timeIntervalNumTasksCheck,           TimeUnit.SECONDS         )         return       } else {         // Job failed, clear internal data.         barrierJobIdToNumTasksCheckFailures.remove(jobId)         listener.jobFailed(e)         return       }         case e: Exception =>       logWarning("Creating new stage failed due to exception - job: " + jobId, e)       listener.jobFailed(e)       return   }   // Job submitted, clear internal data.   barrierJobIdToNumTasksCheckFailures.remove(jobId)       val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)   clearCacheLocs()   logInfo("Got job %s (%s) with %d output partitions".format(     job.jobId, callSite.shortForm, partitions.length))   logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")   logInfo("Parents of final stage: " + finalStage.parents)   logInfo("Missing parents: " + getMissingParentStages(finalStage))       val jobSubmissionTime = clock.getTimeMillis()   jobIdToActiveJob(jobId) = job   activeJobs += job   finalStage.setActiveJob(job)   val stageIds = jobIdToStageIds(jobId).toArray   val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))   listenerBus.post(     SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))   submitStage(finalStage) }         private def submitStage(stage: Stage) {   val jobId = activeJobForStage(stage)   if (jobId.isDefined) {     logDebug("submitStage(" + stage + ")")     if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {       val missing = getMissingParentStages(stage).sortBy(_.id)       logDebug("missing: " + missing)       if (missing.isEmpty) {         logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")         submitMissingTasks(stage, jobId.get)       } else {         for (parent <- missing) {           submitStage(parent)         }         waitingStages += stage       }     }   } else {     abortStage(stage, "No active job for stage " + stage.id, None)   } }     private def getMissingParentStages(stage: Stage): List[Stage] = {   val missing = new HashSet[Stage]   val visited = new HashSet[RDD[_]]   // We are manually maintaining a stack here to prevent *Error   // caused by recursively visiting   val waitingForVisit = new ArrayStack[RDD[_]]   def visit(rdd: RDD[_]) {     if (!visited(rdd)) {       visited += rdd       val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)       if (rddHasUncachedPartitions) {         for (dep <- rdd.dependencies) {           dep match {             case shufDep: ShuffleDependency[_, _, _] =>               val mapStage = getOrCreateShuffleMapStage(shufDep, stage.firstJobId)               if (!mapStage.isAvailable) {                 missing += mapStage               }             case narrowDep: NarrowDependency[_] =>               waitingForVisit.push(narrowDep.rdd)           }         }       }     }   }   waitingForVisit.push(stage.rdd)   while (waitingForVisit.nonEmpty) {     visit(waitingForVisit.pop())   }   missing.toList }       private def submitMissingTasks(stage: Stage, jobId: Int) {   logDebug("submitMissingTasks(" + stage + ")")       // First figure out the indexes of partition ids to compute.   val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()       // Use the scheduling pool, job group, description, etc. from an ActiveJob associated   // with this Stage   val properties = jobIdToActiveJob(jobId).properties       runningStages += stage   // SparkListenerStageSubmitted should be posted before testing whether tasks are   // serializable. If tasks are not serializable, a SparkListenerStageCompleted event   // will be posted, which should always come after a corresponding SparkListenerStageSubmitted   // event.   stage match {     case s: ShuffleMapStage =>       outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)     case s: ResultStage =>       outputCommitCoordinator.stageStart(         stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)   }   val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {     stage match {       case s: ShuffleMapStage =>         partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap       case s: ResultStage =>         partitionsToCompute.map { id =>           val p = s.partitions(id)           (id, getPreferredLocs(stage.rdd, p))         }.toMap     }   } catch {     case NonFatal(e) =>       stage.makeNewStageAttempt(partitionsToCompute.size)       listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))       abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))       runningStages -= stage       return   }       stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)       // If there are tasks to execute, record the submission time of the stage. Otherwise,   // post the even without the submission time, which indicates that this stage was   // skipped.   if (partitionsToCompute.nonEmpty) {     stage.latestInfo.submissionTime = Some(clock.getTimeMillis())   }   listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))       // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.   // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast   // the serialized copy of the RDD and for each task we will deserialize it, which means each   // task gets a different copy of the RDD. This provides stronger isolation between tasks that   // might modify state of objects referenced in their closures. This is necessary in Hadoop   // where the JobConf/Configuration object is not thread-safe.   var taskBinary: Broadcast[Array[Byte]] = null   var partitions: Array[Partition] = null   try {     // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).     // For ResultTask, serialize and broadcast (rdd, func).     var taskBinaryBytes: Array[Byte] = null     // taskBinaryBytes and partitions are both effected by the checkpoint status. We need     // this synchronization in case another concurrent job is checkpointing this RDD, so we get a     // consistent view of both variables.     RDDCheckpointData.synchronized {       taskBinaryBytes = stage match {         case stage: ShuffleMapStage =>           JavaUtils.bufferToArray(             closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))         case stage: ResultStage =>           JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))       }           partitions = stage.rdd.partitions     }         taskBinary = sc.broadcast(taskBinaryBytes)   } catch {     // In the case of a failure during serialization, abort the stage.     case e: NotSerializableException =>       abortStage(stage, "Task not serializable: " + e.toString, Some(e))       runningStages -= stage           // Abort execution       return     case NonFatal(e) =>       abortStage(stage, s"Task serialization failed: $e\n${Utils.exceptionString(e)}", Some(e))       runningStages -= stage       return   }       val tasks: Seq[Task[_]] = try {     val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()     stage match {       case stage: ShuffleMapStage =>         stage.pendingPartitions.clear()         partitionsToCompute.map { id =>           val locs = taskIdToLocations(id)           val part = partitions(id)           stage.pendingPartitions += id           new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,             taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),             Option(sc.applicationId), sc.applicationAttemptId, stage.rdd.isBarrier())         }           case stage: ResultStage =>         partitionsToCompute.map { id =>           val p: Int = stage.partitions(id)           val part = partitions(p)           val locs = taskIdToLocations(id)           new ResultTask(stage.id, stage.latestInfo.attemptNumber,             taskBinary, part, locs, id, properties, serializedTaskMetrics,             Option(jobId), Option(sc.applicationId), sc.applicationAttemptId,             stage.rdd.isBarrier())         }     }   } catch {     case NonFatal(e) =>       abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))       runningStages -= stage       return   }       if (tasks.size > 0) {     logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +       s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")     taskScheduler.submitTasks(new TaskSet(       tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties))   } else {     // Because we posted SparkListenerStageSubmitted earlier, we should mark     // the stage as completed here in case there are no tasks to run     markStageAsFinished(stage, None)         stage match {       case stage: ShuffleMapStage =>         logDebug(s"Stage ${stage} is actually done; " +             s"(available: ${stage.isAvailable}," +             s"available outputs: ${stage.numAvailableOutputs}," +             s"partitions: ${stage.numPartitions})")         markMapStageJobsAsFinished(stage)       case stage : ResultStage =>         logDebug(s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})")     }     submitWaitingChildStages(stage)   } }       private def submitWaitingChildStages(parent: Stage) {   logTrace(s"Checking if any dependencies of $parent are now runnable")   logTrace("running: " + runningStages)   logTrace("waiting: " + waitingStages)   logTrace("failed: " + failedStages)   val childStages = waitingStages.filter(_.parents.contains(parent)).toArray   waitingStages --= childStages   for (stage <- childStages.sortBy(_.firstJobId)) {     submitStage(stage)   } }   6.TaskScheduleImpl 这部实际是对taskset 进行封装成TaskSetManager 放入队列 override def submitTasks(taskSet: TaskSet) {   val tasks = taskSet.tasks   logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")   this.synchronized {     val manager = createTaskSetManager(taskSet, maxTaskFailures)     val stage = taskSet.stageId     val stageTaskSets =       taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])     stageTaskSets(taskSet.stageAttemptId) = manager     val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>       ts.taskSet != taskSet && !ts.isZombie     }     if (conflictingTaskSet) {       throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +         s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")     }     //这一步实际上把taskset放入调度队列中     schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)         if (!isLocal && !hasReceivedTask) {       starvationTimer.scheduleAtFixedRate(new TimerTask() {         override def run() {           if (!hasLaunchedTask) {             logWarning("Initial job has not accepted any resources; " +               "check your cluster UI to ensure that workers are registered " +               "and have sufficient resources")           } else {             this.cancel()           }         }       }, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)     }     hasReceivedTask = true   }     //通知 StandaloneSchedulerBackend 进行通知,对任务队列中的task 进行分配executor    backend.reviveOffers() }     7.FIFOSchedulableBuilder //将TaskSetManager 放入调度队列中 override def addTaskSetManager(manager: Schedulable, properties: Properties) {   rootPool.addSchedulable(manager) }     8.CoarseGrainedSchedulerBackend 主要是对executor进行过滤,然后executor 和 task 分配 最后启动task,也就是向executor 发送launchtask 的消息  launchTask 其实发送的是TaskDescription,TaskDescription 包含了 task 和 executor 信息 TaskSetManager 生成的 TaskDescription   private def makeOffers() {   // Make sure no executor is killed while some task is launching on it   val taskDescs = CoarseGrainedSchedulerBackend.this.synchronized {     // Filter out executors under killing     val activeExecutors = executorDataMap.filterKeys(executorIsAlive)     val workOffers = activeExecutors.map {       case (id, executorData) =>         new WorkerOffer(id, executorData.executorHost, executorData.freeCores,           Some(executorData.executorAddress.hostPort))     }.toIndexedSeq     scheduler.resourceOffers(workOffers)   }   if (!taskDescs.isEmpty) {     launchTasks(taskDescs)   } }       def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {   // Mark each slave as alive and remember its hostname   // Also track if new executor is added   var newExecAvail = false   for (o <- offers) {     if (!hostToExecutors.contains(o.host)) {       hostToExecutors(o.host) = new HashSet[String]()     }     if (!executorIdToRunningTaskIds.contains(o.executorId)) {       hostToExecutors(o.host) += o.executorId       executorAdded(o.executorId, o.host)       executorIdToHost(o.executorId) = o.host       executorIdToRunningTaskIds(o.executorId) = HashSet[Long]()       newExecAvail = true     }     for (rack <- getRackForHost(o.host)) {       hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host     }   }       // Before making any offers, remove any nodes from the blacklist whose blacklist has expired. Do   // this here to avoid a separate thread and added synchronization overhead, and also because   // updating the blacklist is only relevant when task offers are being made.   blacklistTrackerOpt.foreach(_.applyBlacklistTimeout())       val filteredOffers = blacklistTrackerOpt.map { blacklistTracker =>     offers.filter { offer =>       !blacklistTracker.isNodeBlacklisted(offer.host) &&         !blacklistTracker.isExecutorBlacklisted(offer.executorId)     }   }.getOrElse(offers)       val shuffledOffers = shuffleOffers(filteredOffers)   // Build a list of tasks to assign to each worker.   val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))   val availableCpus = shuffledOffers.map(o => o.cores).toArray   val availableSlots = shuffledOffers.map(o => o.cores / CPUS_PER_TASK).sum   val sortedTaskSets = rootPool.getSortedTaskSetQueue   for (taskSet <- sortedTaskSets) {     logDebug("parentName: %s, name: %s, runningTasks: %s".format(       taskSet.parent.name, taskSet.name, taskSet.runningTasks))     if (newExecAvail) {       taskSet.executorAdded()     }   }       // Take each TaskSet in our scheduling order, and then offer it each node in increasing order   // of locality levels so that it gets a chance to launch local tasks on all of them.   // NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY   for (taskSet <- sortedTaskSets) {     // Skip the barrier taskSet if the available slots are less than the number of pending tasks.     if (taskSet.isBarrier && availableSlots < taskSet.numTasks) {       // Skip the launch process.       // TODO SPARK-24819 If the job requires more slots than available (both busy and free       // slots), fail the job on submit.       logInfo(s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " +         s"because the barrier taskSet requires ${taskSet.numTasks} slots, while the total " +         s"number of available slots is $availableSlots.")     } else {       var launchedAnyTask = false       // Record all the executor IDs assigned barrier tasks on.       val addressesWithDescs = ArrayBuffer[(String, TaskDescription)]()       for (currentMaxLocality <- taskSet.myLocalityLevels) {         var launchedTaskAtCurrentMaxLocality = false         do {           launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(taskSet,             currentMaxLocality, shuffledOffers, availableCpus, tasks, addressesWithDescs)           launchedAnyTask |= launchedTaskAtCurrentMaxLocality         } while (launchedTaskAtCurrentMaxLocality)       }           if (!launchedAnyTask) {         taskSet.getCompletelyBlacklistedTaskIfAny(hostToExecutors).foreach { taskIndex =>             // If the taskSet is unschedulable we try to find an existing idle blacklisted             // executor. If we cannot find one, we abort immediately. Else we kill the idle             // executor and kick off an abortTimer which if it doesn't schedule a task within the             // the timeout will abort the taskSet if we were unable to schedule any task from the             // taskSet.             // Note 1: We keep track of schedulability on a per taskSet basis rather than on a per             // task basis.             // Note 2: The taskSet can still be aborted when there are more than one idle             // blacklisted executors and dynamic allocation is on. This can happen when a killed             // idle executor isn't replaced in time by ExecutorAllocationManager as it relies on             // pending tasks and doesn't kill executors on idle timeouts, resulting in the abort             // timer to expire and abort the taskSet.             executorIdToRunningTaskIds.find(x => !isExecutorBusy(x._1)) match {               case Some ((executorId, _)) =>                 if (!unschedulableTaskSetToExpiryTime.contains(taskSet)) {                   blacklistTrackerOpt.foreach(blt => blt.killBlacklistedIdleExecutor(executorId))                       val timeout = conf.get(config.UNSCHEDULABLE_TASKSET_TIMEOUT) * 1000                   unschedulableTaskSetToExpiryTime(taskSet) = clock.getTimeMillis() + timeout                   logInfo(s"Waiting for $timeout ms for completely "                     + s"blacklisted task to be schedulable again before aborting $taskSet.")                   abortTimer.schedule(                     createUnschedulableTaskSetAbortTimer(taskSet, taskIndex), timeout)                 }               case None => // Abort Immediately                 logInfo("Cannot schedule any task because of complete blacklisting. No idle" +                   s" executors can be found to kill. Aborting $taskSet." )                 taskSet.abortSinceCompletelyBlacklisted(taskIndex)             }         }       } else {         // We want to defer killing any taskSets as long as we have a non blacklisted executor         // which can be used to schedule a task from any active taskSets. This ensures that the         // job can make progress.         // Note: It is theoretically possible that a taskSet never gets scheduled on a         // non-blacklisted executor and the abort timer doesn't kick in because of a constant         // submission of new TaskSets. See the PR for more details.         if (unschedulableTaskSetToExpiryTime.nonEmpty) {           logInfo("Clearing the expiry times for all unschedulable taskSets as a task was " +             "recently scheduled.")           unschedulableTaskSetToExpiryTime.clear()         }       }           if (launchedAnyTask && taskSet.isBarrier) {         // Check whether the barrier tasks are partially launched.         // TODO SPARK-24818 handle the assert failure case (that can happen when some locality         // requirements are not fulfilled, and we should revert the launched tasks).         require(addressesWithDescs.size == taskSet.numTasks,           s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " +             s"because only ${addressesWithDescs.size} out of a total number of " +             s"${taskSet.numTasks} tasks got resource offers. The resource offers may have " +             "been blacklisted or cannot fulfill task locality requirements.")             // materialize the barrier coordinator.         maybeInitBarrierCoordinator()             // Update the taskInfos into all the barrier task properties.         val addressesStr = addressesWithDescs           // Addresses ordered by partitionId           .sortBy(_._2.partitionId)           .map(_._1)           .mkString(",")         addressesWithDescs.foreach(_._2.properties.setProperty("addresses", addressesStr))             logInfo(s"Successfully scheduled all the ${addressesWithDescs.size} tasks for barrier " +           s"stage ${taskSet.stageId}.")       }     }   }       // TODO SPARK-24823 Cancel a job that contains barrier stage(s) if the barrier tasks don't get   // launched within a configured time.   if (tasks.size > 0) {     hasLaunchedTask = true   }   return tasks }         private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {   for (task <- tasks.flatten) {     val serializedTask = TaskDescription.encode(task)     if (serializedTask.limit() >= maxRpcMessageSize) {       Option(scheduler.taskIdToTaskSetManager.get(task.taskId)).foreach { taskSetMgr =>         try {           var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +             "spark.rpc.message.maxSize (%d bytes). Consider increasing " +             "spark.rpc.message.maxSize or using broadcast variables for large values."           msg = msg.format(task.taskId, task.index, serializedTask.limit(), maxRpcMessageSize)           taskSetMgr.abort(msg)         } catch {           case e: Exception => logError("Exception in error callback", e)         }       }     }     else {       val executorData = executorDataMap(task.executorId)       executorData.freeCores -= scheduler.CPUS_PER_TASK           logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +         s"${executorData.executorHost}.")           executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))     }   } }   1.Rdd rdd中 reduce、fold、aggregate 这些ShuffleTask  还有collect、count这些finalTask 都会调用 sparkContext.runJob def reduce(f: (T, T) => T): T = withScope {   val cleanF = sc.clean(f)   val reducePartition: Iterator[T] => Option[T] = iter => {     if (iter.hasNext) {       Some(iter.reduceLeft(cleanF))     } else {       None     }   }   var jobResult: Option[T] = None   val mergeResult = (index: Int, taskResult: Option[T]) => {     if (taskResult.isDefined) {       jobResult = jobResult match {         case Some(value) => Some(f(value, taskResult.get))         case None => taskResult       }     }   }   sc.runJob(this, reducePartition, mergeResult)   // Get the final result out of our Option, or throw an exception if the RDD was empty   jobResult.getOrElse(throw new UnsupportedOperationException("empty collection")) }     def runJob[T, U: ClassTag](     rdd: RDD[T],     processPartition: Iterator[T] => U,     resultHandler: (Int, U) => Unit) {   val processFunc = (context: TaskContext, iter: Iterator[T]) => processPartition(iter)   runJob[T, U](rdd, processFunc, 0 until rdd.partitions.length, resultHandler) }   2.SparkContext def runJob[T, U: ClassTag](     rdd: RDD[T],     func: (TaskContext, Iterator[T]) => U,     partitions: Seq[Int],     resultHandler: (Int, U) => Unit): Unit = {   if (stopped.get()) {     throw new IllegalStateException("SparkContext has been shutdown")   }   val callSite = getCallSite   val cleanedFunc = clean(func)   logInfo("Starting job: " + callSite.shortForm)   if (conf.getBoolean("spark.logLineage", false)) {     logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)   }   dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)   progressBar.foreach(_.finishAll())   rdd.doCheckpoint() }   3.DAGSchedule def runJob[T, U](     rdd: RDD[T],     func: (TaskContext, Iterator[T]) => U,     partitions: Seq[Int],     callSite: CallSite,     resultHandler: (Int, U) => Unit,     properties: Properties): Unit = {   val start = System.nanoTime   val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)   ThreadUtils.awaitReady(waiter.completionFuture, Duration.Inf)   waiter.completionFuture.value.get match {     case scala.util.Success(_) =>       logInfo("Job %d finished: %s, took %f s".format         (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))     case scala.util.Failure(exception) =>       logInfo("Job %d failed: %s, took %f s".format         (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))       // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.       val callerStackTrace = Thread.currentThread().getStackTrace.tail       exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)       throw exception   } }     def submitJob[T, U](     rdd: RDD[T],     func: (TaskContext, Iterator[T]) => U,     partitions: Seq[Int],     callSite: CallSite,     resultHandler: (Int, U) => Unit,     properties: Properties): JobWaiter[U] = {   // Check to make sure we are not launching a task on a partition that does not exist.   val maxPartitions = rdd.partitions.length   partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>     throw new IllegalArgumentException(       "Attempting to access a non-existent partition: " + p + ". " +         "Total number of partitions: " + maxPartitions)   }       val jobId = nextJobId.getAndIncrement()   if (partitions.size == 0) {     // Return immediately if the job is running 0 tasks     return new JobWaiter[U](this, jobId, 0, resultHandler)   }       assert(partitions.size > 0)   val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]   val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)   eventProcessLoop.post((     jobId, rdd, func2, partitions.toArray, callSite, waiter,     SerializationUtils.clone(properties)))   waiter }   4.DAGSchedulerEventProcessLoop override def onReceive(event: DAGSchedulerEvent): Unit = {   val timerContext = timer.time()   try {     doOnReceive(event)   } finally {     timerContext.stop()   } }     private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {   case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>     dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)       case MapStageSubmitted(jobId, dependency, callSite, listener, properties) =>     dagScheduler.handleMapStageSubmitted(jobId, dependency, callSite, listener, properties)       case StageCancelled(stageId, reason) =>     dagScheduler.handleStageCancellation(stageId, reason)       case JobCancelled(jobId, reason) =>     dagScheduler.handleJobCancellation(jobId, reason)       case JobGroupCancelled(groupId) =>     dagScheduler.handleJobGroupCancelled(groupId)       case AllJobsCancelled =>     dagScheduler.doCancelAllJobs()       case ExecutorAdded(execId, host) =>     dagScheduler.handleExecutorAdded(execId, host)       case ExecutorLost(execId, reason) =>     val workerLost = reason match {       case SlaveLost(_, true) => true       case _ => false     }     dagScheduler.handleExecutorLost(execId, workerLost)       case WorkerRemoved(workerId, host, message) =>     dagScheduler.handleWorkerRemoved(workerId, host, message)       case BeginEvent(task, taskInfo) =>     dagScheduler.handleBeginEvent(task, taskInfo)       case SpeculativeTaskSubmitted(task) =>     dagScheduler.handleSpeculativeTaskSubmitted(task)       case GettingResultEvent(taskInfo) =>     dagScheduler.handleGetTaskResult(taskInfo)       case completion: CompletionEvent =>     dagScheduler.handleTaskCompletion(completion)       case TaskSetFailed(taskSet, reason, exception) =>     dagScheduler.handleTaskSetFailed(taskSet, reason, exception)       case ResubmitFailedStages =>     dagScheduler.resubmitFailedStages() }   5.DAGScheduler   M-submitStage 和 M-getMissingParentStages 构成spark stage划分  划分过程中创建stage 是 M-getOrCreateShuffleMapStage 第一次会创建,第二次就是从map中取(也就是从内存中取)   把一个app 划分成多个stage 使用M-submitMissingTasks 提交过去   M-submitStage 划分过程 ResultStage 是最后一个stage , 假如ResultStage 依赖ShuffleMapStage B ShuffleMapStage B 依赖ShuffleMapStage A 会优先提交A,提交后把 B 和Result 放入 waitingStages   M-submitMissingTasks  根据不同的Stage  将rdd 和 func 或者 stage.shuffleDep 封装到 taskBinaryBytes 最后更具不同的partition id放入Task 中  存入taskset 中   等A 运行完之后,最后一行 submitWaitingChildStages(stage)   M-submitWaitingChildStages 根据当前的stage 从waitingStages 找出当前的stage 的子stage  然后再次提交到  submitStage   M-getMissingParentStages if (!mapStage.isAvailable)  则不为true 则不会再次提交 这个是获取mapOutputTrackerMaster 中  _numAvailableOutputs 数量是否和分区数相等。如果相等,则表示 该Stage 已经处理过   taskBinaryBytes = stage match {   case stage: ShuffleMapStage =>     JavaUtils.bufferToArray(       closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))   case stage: ResultStage =>     JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef)) }     taskBinary = sc.broadcast(taskBinaryBytes)     new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,   taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),   Option(sc.applicationId), sc.applicationAttemptId, stage.rdd.isBarrier())   new ResultTask(stage.id, stage.latestInfo.attemptNumber,   taskBinary, part, locs, id, properties, serializedTaskMetrics,   Option(jobId), Option(sc.applicationId), sc.applicationAttemptId,   stage.rdd.isBarrier())     private[scheduler] def handleJobSubmitted(jobId: Int,     finalRDD: RDD[_],     func: (TaskContext, Iterator[_]) => _,     partitions: Array[Int],     callSite: CallSite,     listener: JobListener,     properties: Properties) {   var finalStage: ResultStage = null   try {     // New stage creation may throw an exception if, for example, jobs are run on a     // HadoopRDD whose underlying HDFS files have been deleted.     finalStage =  createResultStage(finalRDD, func, partitions, jobId, callSite)   } catch {     case e: BarrierJobSlotsNumberCheckFailed =>       logWarning(s"The job $jobId requires to run a barrier stage that requires more slots " +         "than the total number of slots in the cluster currently.")       // If jobId doesn't exist in the map, Scala coverts its value null to 0: Int automatically.       val numCheckFailures = barrierJobIdToNumTasksCheckFailures.compute(jobId,         new BiFunction[Int, Int, Int] {           override def apply(key: Int, value: Int): Int = value + 1         })       if (numCheckFailures <= maxFailureNumTasksCheck) {         messageScheduler.schedule(           new Runnable {             override def run(): Unit = eventProcessLoop.post(JobSubmitted(jobId, finalRDD, func,               partitions, callSite, listener, properties))           },           timeIntervalNumTasksCheck,           TimeUnit.SECONDS         )         return       } else {         // Job failed, clear internal data.         barrierJobIdToNumTasksCheckFailures.remove(jobId)         listener.jobFailed(e)         return       }         case e: Exception =>       logWarning("Creating new stage failed due to exception - job: " + jobId, e)       listener.jobFailed(e)       return   }   // Job submitted, clear internal data.   barrierJobIdToNumTasksCheckFailures.remove(jobId)       val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)   clearCacheLocs()   logInfo("Got job %s (%s) with %d output partitions".format(     job.jobId, callSite.shortForm, partitions.length))   logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")   logInfo("Parents of final stage: " + finalStage.parents)   logInfo("Missing parents: " + getMissingParentStages(finalStage))       val jobSubmissionTime = clock.getTimeMillis()   jobIdToActiveJob(jobId) = job   activeJobs += job   finalStage.setActiveJob(job)   val stageIds = jobIdToStageIds(jobId).toArray   val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))   listenerBus.post(     SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))   submitStage(finalStage) }         private def submitStage(stage: Stage) {   val jobId = activeJobForStage(stage)   if (jobId.isDefined) {     logDebug("submitStage(" + stage + ")")     if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {       val missing = getMissingParentStages(stage).sortBy(_.id)       logDebug("missing: " + missing)       if (missing.isEmpty) {         logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")         submitMissingTasks(stage, jobId.get)       } else {         for (parent <- missing) {           submitStage(parent)         }         waitingStages += stage       }     }   } else {     abortStage(stage, "No active job for stage " + stage.id, None)   } }     private def getMissingParentStages(stage: Stage): List[Stage] = {   val missing = new HashSet[Stage]   val visited = new HashSet[RDD[_]]   // We are manually maintaining a stack here to prevent *Error   // caused by recursively visiting   val waitingForVisit = new ArrayStack[RDD[_]]   def visit(rdd: RDD[_]) {     if (!visited(rdd)) {       visited += rdd       val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)       if (rddHasUncachedPartitions) {         for (dep <- rdd.dependencies) {           dep match {             case shufDep: ShuffleDependency[_, _, _] =>               val mapStage = getOrCreateShuffleMapStage(shufDep, stage.firstJobId)               if (!mapStage.isAvailable) {                 missing += mapStage               }             case narrowDep: NarrowDependency[_] =>               waitingForVisit.push(narrowDep.rdd)           }         }       }     }   }   waitingForVisit.push(stage.rdd)   while (waitingForVisit.nonEmpty) {     visit(waitingForVisit.pop())   }   missing.toList }       private def submitMissingTasks(stage: Stage, jobId: Int) {   logDebug("submitMissingTasks(" + stage + ")")       // First figure out the indexes of partition ids to compute.   val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()       // Use the scheduling pool, job group, description, etc. from an ActiveJob associated   // with this Stage   val properties = jobIdToActiveJob(jobId).properties       runningStages += stage   // SparkListenerStageSubmitted should be posted before testing whether tasks are   // serializable. If tasks are not serializable, a SparkListenerStageCompleted event   // will be posted, which should always come after a corresponding SparkListenerStageSubmitted   // event.   stage match {     case s: ShuffleMapStage =>       outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)     case s: ResultStage =>       outputCommitCoordinator.stageStart(         stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)   }   val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {     stage match {       case s: ShuffleMapStage =>         partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap       case s: ResultStage =>         partitionsToCompute.map { id =>           val p = s.partitions(id)           (id, getPreferredLocs(stage.rdd, p))         }.toMap     }   } catch {     case NonFatal(e) =>       stage.makeNewStageAttempt(partitionsToCompute.size)       listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))       abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))       runningStages -= stage       return   }       stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)       // If there are tasks to execute, record the submission time of the stage. Otherwise,   // post the even without the submission time, which indicates that this stage was   // skipped.   if (partitionsToCompute.nonEmpty) {     stage.latestInfo.submissionTime = Some(clock.getTimeMillis())   }   listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))       // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.   // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast   // the serialized copy of the RDD and for each task we will deserialize it, which means each   // task gets a different copy of the RDD. This provides stronger isolation between tasks that   // might modify state of objects referenced in their closures. This is necessary in Hadoop   // where the JobConf/Configuration object is not thread-safe.   var taskBinary: Broadcast[Array[Byte]] = null   var partitions: Array[Partition] = null   try {     // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).     // For ResultTask, serialize and broadcast (rdd, func).     var taskBinaryBytes: Array[Byte] = null     // taskBinaryBytes and partitions are both effected by the checkpoint status. We need     // this synchronization in case another concurrent job is checkpointing this RDD, so we get a     // consistent view of both variables.     RDDCheckpointData.synchronized {       taskBinaryBytes = stage match {         case stage: ShuffleMapStage =>           JavaUtils.bufferToArray(             closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))         case stage: ResultStage =>           JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))       }           partitions = stage.rdd.partitions     }         taskBinary = sc.broadcast(taskBinaryBytes)   } catch {     // In the case of a failure during serialization, abort the stage.     case e: NotSerializableException =>       abortStage(stage, "Task not serializable: " + e.toString, Some(e))       runningStages -= stage           // Abort execution       return     case NonFatal(e) =>       abortStage(stage, s"Task serialization failed: $e\n${Utils.exceptionString(e)}", Some(e))       runningStages -= stage       return   }       val tasks: Seq[Task[_]] = try {     val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()     stage match {       case stage: ShuffleMapStage =>         stage.pendingPartitions.clear()         partitionsToCompute.map { id =>           val locs = taskIdToLocations(id)           val part = partitions(id)           stage.pendingPartitions += id           new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,             taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),             Option(sc.applicationId), sc.applicationAttemptId, stage.rdd.isBarrier())         }           case stage: ResultStage =>         partitionsToCompute.map { id =>           val p: Int = stage.partitions(id)           val part = partitions(p)           val locs = taskIdToLocations(id)           new ResultTask(stage.id, stage.latestInfo.attemptNumber,             taskBinary, part, locs, id, properties, serializedTaskMetrics,             Option(jobId), Option(sc.applicationId), sc.applicationAttemptId,             stage.rdd.isBarrier())         }     }   } catch {     case NonFatal(e) =>       abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))       runningStages -= stage       return   }       if (tasks.size > 0) {     logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +       s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")     taskScheduler.submitTasks(new TaskSet(       tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties))   } else {     // Because we posted SparkListenerStageSubmitted earlier, we should mark     // the stage as completed here in case there are no tasks to run     markStageAsFinished(stage, None)         stage match {       case stage: ShuffleMapStage =>         logDebug(s"Stage ${stage} is actually done; " +             s"(available: ${stage.isAvailable}," +             s"available outputs: ${stage.numAvailableOutputs}," +             s"partitions: ${stage.numPartitions})")         markMapStageJobsAsFinished(stage)       case stage : ResultStage =>         logDebug(s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})")     }     submitWaitingChildStages(stage)   } }       private def submitWaitingChildStages(parent: Stage) {   logTrace(s"Checking if any dependencies of $parent are now runnable")   logTrace("running: " + runningStages)   logTrace("waiting: " + waitingStages)   logTrace("failed: " + failedStages)   val childStages = waitingStages.filter(_.parents.contains(parent)).toArray   waitingStages --= childStages   for (stage <- childStages.sortBy(_.firstJobId)) {     submitStage(stage)   } }   6.TaskScheduleImpl 这部实际是对taskset 进行封装成TaskSetManager 放入队列 override def submitTasks(taskSet: TaskSet) {   val tasks = taskSet.tasks   logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")   this.synchronized {     val manager = createTaskSetManager(taskSet, maxTaskFailures)     val stage = taskSet.stageId     val stageTaskSets =       taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])     stageTaskSets(taskSet.stageAttemptId) = manager     val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>       ts.taskSet != taskSet && !ts.isZombie     }     if (conflictingTaskSet) {       throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +         s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")     }     //这一步实际上把taskset放入调度队列中     schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)         if (!isLocal && !hasReceivedTask) {       starvationTimer.scheduleAtFixedRate(new TimerTask() {         override def run() {           if (!hasLaunchedTask) {             logWarning("Initial job has not accepted any resources; " +               "check your cluster UI to ensure that workers are registered " +               "and have sufficient resources")           } else {             this.cancel()           }         }       }, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)     }     hasReceivedTask = true   }     //通知 StandaloneSchedulerBackend 进行通知,对任务队列中的task 进行分配executor    backend.reviveOffers() }     7.FIFOSchedulableBuilder //将TaskSetManager 放入调度队列中 override def addTaskSetManager(manager: Schedulable, properties: Properties) {   rootPool.addSchedulable(manager) }     8.CoarseGrainedSchedulerBackend 主要是对executor进行过滤,然后executor 和 task 分配 最后启动task,也就是向executor 发送launchtask 的消息  launchTask 其实发送的是TaskDescription,TaskDescription 包含了 task 和 executor 信息 TaskSetManager 生成的 TaskDescription   private def makeOffers() {   // Make sure no executor is killed while some task is launching on it   val taskDescs = CoarseGrainedSchedulerBackend.this.synchronized {     // Filter out executors under killing     val activeExecutors = executorDataMap.filterKeys(executorIsAlive)     val workOffers = activeExecutors.map {       case (id, executorData) =>         new WorkerOffer(id, executorData.executorHost, executorData.freeCores,           Some(executorData.executorAddress.hostPort))     }.toIndexedSeq     scheduler.resourceOffers(workOffers)   }   if (!taskDescs.isEmpty) {     launchTasks(taskDescs)   } }       def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {   // Mark each slave as alive and remember its hostname   // Also track if new executor is added   var newExecAvail = false   for (o <- offers) {     if (!hostToExecutors.contains(o.host)) {       hostToExecutors(o.host) = new HashSet[String]()     }     if (!executorIdToRunningTaskIds.contains(o.executorId)) {       hostToExecutors(o.host) += o.executorId       executorAdded(o.executorId, o.host)       executorIdToHost(o.executorId) = o.host       executorIdToRunningTaskIds(o.executorId) = HashSet[Long]()       newExecAvail = true     }     for (rack <- getRackForHost(o.host)) {       hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host     }   }       // Before making any offers, remove any nodes from the blacklist whose blacklist has expired. Do   // this here to avoid a separate thread and added synchronization overhead, and also because   // updating the blacklist is only relevant when task offers are being made.   blacklistTrackerOpt.foreach(_.applyBlacklistTimeout())       val filteredOffers = blacklistTrackerOpt.map { blacklistTracker =>     offers.filter { offer =>       !blacklistTracker.isNodeBlacklisted(offer.host) &&         !blacklistTracker.isExecutorBlacklisted(offer.executorId)     }   }.getOrElse(offers)       val shuffledOffers = shuffleOffers(filteredOffers)   // Build a list of tasks to assign to each worker.   val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))   val availableCpus = shuffledOffers.map(o => o.cores).toArray   val availableSlots = shuffledOffers.map(o => o.cores / CPUS_PER_TASK).sum   val sortedTaskSets = rootPool.getSortedTaskSetQueue   for (taskSet <- sortedTaskSets) {     logDebug("parentName: %s, name: %s, runningTasks: %s".format(       taskSet.parent.name, taskSet.name, taskSet.runningTasks))     if (newExecAvail) {       taskSet.executorAdded()     }   }       // Take each TaskSet in our scheduling order, and then offer it each node in increasing order   // of locality levels so that it gets a chance to launch local tasks on all of them.   // NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY   for (taskSet <- sortedTaskSets) {     // Skip the barrier taskSet if the available slots are less than the number of pending tasks.     if (taskSet.isBarrier && availableSlots < taskSet.numTasks) {       // Skip the launch process.       // TODO SPARK-24819 If the job requires more slots than available (both busy and free       // slots), fail the job on submit.       logInfo(s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " +         s"because the barrier taskSet requires ${taskSet.numTasks} slots, while the total " +         s"number of available slots is $availableSlots.")     } else {       var launchedAnyTask = false       // Record all the executor IDs assigned barrier tasks on.       val addressesWithDescs = ArrayBuffer[(String, TaskDescription)]()       for (currentMaxLocality <- taskSet.myLocalityLevels) {         var launchedTaskAtCurrentMaxLocality = false         do {           launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(taskSet,             currentMaxLocality, shuffledOffers, availableCpus, tasks, addressesWithDescs)           launchedAnyTask |= launchedTaskAtCurrentMaxLocality         } while (launchedTaskAtCurrentMaxLocality)       }           if (!launchedAnyTask) {         taskSet.getCompletelyBlacklistedTaskIfAny(hostToExecutors).foreach { taskIndex =>             // If the taskSet is unschedulable we try to find an existing idle blacklisted             // executor. If we cannot find one, we abort immediately. Else we kill the idle             // executor and kick off an abortTimer which if it doesn't schedule a task within the             // the timeout will abort the taskSet if we were unable to schedule any task from the             // taskSet.             // Note 1: We keep track of schedulability on a per taskSet basis rather than on a per             // task basis.             // Note 2: The taskSet can still be aborted when there are more than one idle             // blacklisted executors and dynamic allocation is on. This can happen when a killed             // idle executor isn't replaced in time by ExecutorAllocationManager as it relies on             // pending tasks and doesn't kill executors on idle timeouts, resulting in the abort             // timer to expire and abort the taskSet.             executorIdToRunningTaskIds.find(x => !isExecutorBusy(x._1)) match {               case Some ((executorId, _)) =>                 if (!unschedulableTaskSetToExpiryTime.contains(taskSet)) {                   blacklistTrackerOpt.foreach(blt => blt.killBlacklistedIdleExecutor(executorId))                       val timeout = conf.get(config.UNSCHEDULABLE_TASKSET_TIMEOUT) * 1000                   unschedulableTaskSetToExpiryTime(taskSet) = clock.getTimeMillis() + timeout                   logInfo(s"Waiting for $timeout ms for completely "                     + s"blacklisted task to be schedulable again before aborting $taskSet.")                   abortTimer.schedule(                     createUnschedulableTaskSetAbortTimer(taskSet, taskIndex), timeout)                 }               case None => // Abort Immediately                 logInfo("Cannot schedule any task because of complete blacklisting. No idle" +                   s" executors can be found to kill. Aborting $taskSet." )                 taskSet.abortSinceCompletelyBlacklisted(taskIndex)             }         }       } else {         // We want to defer killing any taskSets as long as we have a non blacklisted executor         // which can be used to schedule a task from any active taskSets. This ensures that the         // job can make progress.         // Note: It is theoretically possible that a taskSet never gets scheduled on a         // non-blacklisted executor and the abort timer doesn't kick in because of a constant         // submission of new TaskSets. See the PR for more details.         if (unschedulableTaskSetToExpiryTime.nonEmpty) {           logInfo("Clearing the expiry times for all unschedulable taskSets as a task was " +             "recently scheduled.")           unschedulableTaskSetToExpiryTime.clear()         }       }           if (launchedAnyTask && taskSet.isBarrier) {         // Check whether the barrier tasks are partially launched.         // TODO SPARK-24818 handle the assert failure case (that can happen when some locality         // requirements are not fulfilled, and we should revert the launched tasks).         require(addressesWithDescs.size == taskSet.numTasks,           s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " +             s"because only ${addressesWithDescs.size} out of a total number of " +             s"${taskSet.numTasks} tasks got resource offers. The resource offers may have " +             "been blacklisted or cannot fulfill task locality requirements.")             // materialize the barrier coordinator.         maybeInitBarrierCoordinator()             // Update the taskInfos into all the barrier task properties.         val addressesStr = addressesWithDescs           // Addresses ordered by partitionId           .sortBy(_._2.partitionId)           .map(_._1)           .mkString(",")         addressesWithDescs.foreach(_._2.properties.setProperty("addresses", addressesStr))             logInfo(s"Successfully scheduled all the ${addressesWithDescs.size} tasks for barrier " +           s"stage ${taskSet.stageId}.")       }     }   }       // TODO SPARK-24823 Cancel a job that contains barrier stage(s) if the barrier tasks don't get   // launched within a configured time.   if (tasks.size > 0) {     hasLaunchedTask = true   }   return tasks }         private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {   for (task <- tasks.flatten) {     val serializedTask = TaskDescription.encode(task)     if (serializedTask.limit() >= maxRpcMessageSize) {       Option(scheduler.taskIdToTaskSetManager.get(task.taskId)).foreach { taskSetMgr =>         try {           var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +             "spark.rpc.message.maxSize (%d bytes). Consider increasing " +             "spark.rpc.message.maxSize or using broadcast variables for large values."           msg = msg.format(task.taskId, task.index, serializedTask.limit(), maxRpcMessageSize)           taskSetMgr.abort(msg)         } catch {           case e: Exception => logError("Exception in error callback", e)         }       }     }     else {       val executorData = executorDataMap(task.executorId)       executorData.freeCores -= scheduler.CPUS_PER_TASK           logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +         s"${executorData.executorHost}.")           executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))     }   } }              
上一篇:爬取招聘网站符合关键字的网址


下一篇:Qt文件路径:QDir