Documentation
¶
Overview ¶
Package util provides the shared constants, types, and utility functions used throughout the plugin.
Index ¶
- Constants
- func ChangeIntArrToStr(top []int, npuCardPreName string) string
- func ChangeNodesToNodeMaps(nodes []*api.NodeInfo) map[string]*api.NodeInfo
- func ChangeTopToIntArray(topStr string, npuCardPreName string) []int
- func CheckPodNameOrSpace(checkItem, podParam string, maxLength int) error
- func CheckStrInSlice(str string, slice []string) bool
- func ConvertErrSliceToError(reErrors []error) error
- func GetActivePodUsedDevFromNode(nodeInfo *api.NodeInfo, devType string) []string
- func GetAvailableDevInfo(devList map[string]string) (string, []string)
- func GetDeviceType(devList map[string]string) string
- func GetNodeDevListFromAnno(nodeInfo *api.NodeInfo) ([]string, error)
- func GetNpuNameFromJobRequire(npuName string) string
- func GetRecoveringDevInfo(devList map[string]string) (string, []string)
- func GetTaskInfoByNameFromSSN(ssn *framework.Session, taskName, taskNamespace string) (*api.TaskInfo, error)
- func GetUnhealthyDevInfo(devList map[string]string) (string, []string)
- func GetVTaskUseTemplate(taskInf *api.TaskInfo) (string, error)
- func IsMapHasNPUResource(resMap map[v1.ResourceName]float64, npuName string) bool
- func IsNPUTask(nT *api.TaskInfo) bool
- func IsNodeReady(node *v1.Node) bool
- func IsSliceContain(keyword interface{}, targetSlice interface{}) bool
- func IsStrategyInSubHealthyStrategs(subHealthyStrategy string) bool
- func MakeDataHash(data interface{}) string
- func Max(x, y int) int
- func Min(x, y int) int
- func PtrInit[T any](v T) *T
- func ReferenceNameOfJob(job *api.JobInfo) string
- func RemoveCommonElement(s1, s2 []int) []int
- func RemoveSliceDuplicateElement(languages []string) []string
- func SafePrint(args ...interface{}) string
- func SortByNumericValue(s []string)
- func UuidOfJob(job *api.JobInfo) types.UID
- type ComJob
- type Device
- type Instance
- type NPUJob
- type NPUTask
- func (asTask *NPUTask) DeleteRealPodByTask(ssn *framework.Session, waitTime int64) error
- func (asTask *NPUTask) EvictJobByTask(ssn *framework.Session, reason string, taskName string) error
- func (asTask *NPUTask) ForceDeletePodByTaskInf(ssn *framework.Session, reason string, nodeName string) error
- func (asTask *NPUTask) InitVTask(taskInf *api.TaskInfo) error
- func (asTask *NPUTask) IsNPUTask() bool
- func (asTask *NPUTask) IsTaskInItsNode(ssn *framework.Session, nodeName string) bool
- func (asTask *NPUTask) IsVNPUTask() bool
- func (asTask *NPUTask) UpdatePodPendingReason(taskInfo *api.TaskInfo, reasonTmp string) error
- type NpuBaseInfo
- type SchedulerJobAttr
- type TaskAllocated
- type VResource
- type VTask
- type VTemplate
Constants ¶
const ( // ChipKind is the prefix of npu resource. ChipKind = "910" // HwPreName is the prefix of npu resource. HwPreName = "huawei.com/" // NPU910CardName for judge 910 npu resource. NPU910CardName = "huawei.com/Ascend910" // NPU910CardNamePre for getting card number. NPU910CardNamePre = "Ascend910-" // NPU310PCardName for judge 310P npu resource. NPU310PCardName = "huawei.com/Ascend310P" // NPU310CardName for judge 310 npu resource. NPU310CardName = "huawei.com/Ascend310" // NPU310CardNamePre for getting card number. NPU310CardNamePre = "Ascend310-" // NPU310PCardNamePre for getting card number. NPU310PCardNamePre = "Ascend310P-" // AscendNPUPodRealUse for NPU pod real use cards. AscendNPUPodRealUse = "huawei.com/AscendReal" // AscendNPUCore for NPU core num, like 56; Records the chip name that the scheduler assigns to the pod. AscendNPUCore = "huawei.com/npu-core" // Ascend910bName for judge Ascend910b npu resource. Ascend910bName = "huawei.com/Ascend910b" // Ascend310P device type 310P Ascend310P = "Ascend310P" // Ascend310 device type 310 Ascend310 = "Ascend310" // Ascend910 device type 910 Ascend910 = "Ascend910" // Pod910DeviceKey pod annotation key, for generate 910 hccl rank table Pod910DeviceKey = "ascend.kubectl.kubernetes.io/ascend-910-configuration" // JobKind910Value in ring-controller.atlas. JobKind910Value = "ascend-910" // JobKind310Value in ring-controller.atlas. JobKind310Value = "ascend-310" // JobKind310PValue 310p ring controller name JobKind310PValue = "ascend-310P" // JobKind910BValue 910B ring controller name JobKind910BValue = "ascend-910b" // Module910bx16AcceleratorType for module mode. Module910bx16AcceleratorType = "module-910b-16" // Module910bx8AcceleratorType for module mode. Module910bx8AcceleratorType = "module-910b-8" // Accelerator310Key accelerator key of old infer card Accelerator310Key = "npu-310-strategy" )
const ( TaskStatusInit = iota TaskStatusAllocate TaskStatusWrBack TaskStatusRunning TaskStatusFailed )
Task status values.
const ( // LogErrorLev for log error. LogErrorLev = 1 // LogWarningLev for log warning. LogWarningLev = 2 // LogInfoLev for log information. LogInfoLev = 3 // LogDebugLev for log debug. LogDebugLev = 4 // ErrorInt return -1 when get error for int ErrorInt = -1 // NPUIndex2 the 2 index. NPUIndex2 = 2 // NPUIndex3 the 3 index. NPUIndex3 = 3 // NPUIndex8 the 8 index. NPUIndex8 = 8 // NPUIndex16 the 16 index. NPUIndex16 = 16 // NPUIndex7 the 7 index. NPUIndex7 = 7 // NPUIndex4 the 4 index. NPUIndex4 = 4 // NPUIndex5 the 5 index. NPUIndex5 = 5 // NPUIndex6 the 6 index. NPUIndex6 = 6 // NPUIndex1 the 1 index. NPUIndex1 = 1 // NPUIndex0 the 0 index. NPUIndex0 = 0 // NPUIndex9 the 9 index. NPUIndex9 = 9 // NPUIndex10 the 10 index. NPUIndex10 = 10 // NPUIndex11 the 11 index. NPUIndex11 = 11 // NPUIndex12 the 12 index. NPUIndex12 = 12 // NPUIndex13 the 13 index. NPUIndex13 = 13 // NPUIndex14 the 14 index. NPUIndex14 = 14 // NPUIndex15 the 15 index. NPUIndex15 = 15 // CoreNum32 32 core 910 CoreNum32 = 32 // CoreNum3 3 core 910 CoreNum3 = 3 // CoreNum5 5 core 910 CoreNum5 = 5 // CoreNum10 10 core 910 CoreNum10 = 10 // CoreNum6 6 core 910 CoreNum6 = 6 // CoreNum12 12 core 910 CoreNum12 = 12 // CoreNum30 30 core 910 CoreNum30 = 30 // CoreNum20 20 core 910 CoreNum20 = 20 // CoreNum25 25 core 910 CoreNum25 = 25 // CoreNum24 24 core 910 CoreNum24 = 24 // CpuNum14 14 cpu 910 CpuNum14 = 14 // CpuNum6 6 cpu 910 CpuNum6 = 6 // MapInitNum for map init length. MapInitNum = 3 // Base10 for const 10. Base10 = 10 // BitSize64 for const 64 BitSize64 = 64 // MaxSliceNum max slice number MaxSliceNum = 128 // NPUHexKilo for const 1000,volcano frame used. NPUHexKilo = 1000 // Accelerator for custom tag. Accelerator = "accelerator" // CMInitParamKey init param key in scheduler configmap CMInitParamKey = "init-params" // AcceleratorType for selector. AcceleratorType = "accelerator-type" // ModuleA3x16AcceleratorType for module mode. 
ModuleA3x16AcceleratorType = "module-a3-16" // ModuleAcceleratorType for module mode. ModuleAcceleratorType = "module" // ServerType server type value takes Ascend310P-10-dual/Ascend910-32... ServerType = "servertype" // ServerTypeDual dual card ServerTypeDual = "dual" // SegmentEnable for VNPU segment enable flag. Default is "false". SegmentEnable = "presetVirtualDevice" // UseClusterInfoManager for use cluster info manager, default is true UseClusterInfoManager = "useClusterInfoManager" // ForceEnqueue for force enqueue when npu num in cluster is meet job required , default is true ForceEnqueue = "forceEnqueue" // SelfMaintainAvailCard for volcano self maintain available card, default is true SelfMaintainAvailCard = "self-maintain-available-card" // SubHealthyStrategyLabel sub-healthy handle strategy. default is grace exit SubHealthyStrategyLabel = "subHealthyStrategy" // SubHealthyIgnore ignore sub-healthy SubHealthyIgnore = "ignore" // SubHealthyGraceExit don't use sub-healthy node and grace exit SubHealthyGraceExit = "graceExit" // SubHealthyForceExit don't use sub-healthy node and force exit SubHealthyForceExit = "forceExit" // SubHealthyHotSwitch strategy name of hot switch SubHealthyHotSwitch = "hotSwitch" // DevInfoNameSpace device-plugin install Namespace DevInfoNameSpace = "kube-system" // MindXDlNameSpace mindx dl Namespace MindXDlNameSpace = "mindx-dl" // DevInfoPreName like "mindx-dl-deviceinfo-ubuntu" DevInfoPreName = "mindx-dl-deviceinfo-" // NodeDCmInfoNamePrefix is for noded to report node healthy state NodeDCmInfoNamePrefix = "mindx-dl-nodeinfo-" // SwitchCmInfoNamePrefix is the prefix for switch fault configmap SwitchCmInfoNamePrefix = "mindx-dl-switchinfo-" // NodeHealthyStatusKey is the key of node healthy status from configmap data of noded and clusterD NodeHealthyStatusKey = "NodeHealthyStatus" // NodeSubHealthy means there is some fault on the node which is reported by nodeD, but will not immediately // make node unhealthy, this status will 
prevent new task schduled on this node and reschedule will not consider // this node NodeSubHealthy = "SubHealthy" // NodeUnHealthy is the node unhealthy status reported by nodeD configmap, switch info CM and clusterD, // in this case pod will be rescheduling NodeUnHealthy = "UnHealthy" // NodeHealthyByNodeD is the node healthy status reported by nodeD configmap NodeHealthyByNodeD = "Healthy" // NodeDEnableKey indicates if the label has been set NodeDEnableKey = "nodeDEnable" // NodeDEnableOnValue the value of NodeDEnableKey, which means nodeD has been enabled NodeDEnableOnValue = "on" // PreSeparateFaultCode PreSeparate fault Code PreSeparateFaultCode = "PreSeparate" // SwitchNodeHealtyStatuskey same with noded there will be healthy subhealthy unhealthy status report by switch info SwitchNodeHealtyStatuskey = "NodeStatus" // DevInfoCMKey mindx-dl-deviceinfo configmap key DevInfoCMKey = "DeviceInfoCfg" // NodeInfoCMKey node info configmap key NodeInfoCMKey = "NodeInfo" // SwitchInfoCmKey is the key of switch info configmap SwitchInfoCmKey = "SwitchInfoCfg" // RePropertyCacheName rescheduling keyword in init env.cache RePropertyCacheName = "re-scheduling" // CmCheckCode Check code key CmCheckCode = "checkCode" // JobRecovery keywords for retain JobRecovery = "job-recovery" // DeleteOperator informer delete operator DeleteOperator = "delete" // AddOperator informer add operator AddOperator = "add" // UpdateOperator informer update operator UpdateOperator = "update" // CmConsumer who uses these configmap CmConsumer = "mx-consumer-volcano" // NormalCmConsumer normal who uses these configmap NormalCmConsumer = "mx-consumer-cim" // CmConsumerValue the value only for true CmConsumerValue = "true" // ClusterDeviceInfo the name of cluster device info configmap ClusterDeviceInfo = "cluster-info-device-" // ClusterNodeInfo the name of cluster node info configmap ClusterNodeInfo = "cluster-info-node-cm" // ClusterSwitchInfo the name of cluster switch info configmap 
ClusterSwitchInfo = "cluster-info-switch-" // PodPredicateTime set pod PodPredicateTime for using by device-plugin. PodPredicateTime = "predicate-time" // NodeNotMeetTopologyWarning node not satisfy the schedulable topology warning. NodeNotMeetTopologyWarning = "the npus on this node don't satisfy the schedulable topology" // ArgumentError argument nil error. ArgumentError = "invalid argument" // RankIdNotExistError rank id does not exist RankIdNotExistError = "rank id does not exist" // JobKindKey for define the Job kind:ascend-310P, ascend-910 JobKindKey = "ring-controller.atlas" // DistributedJobKey flag for distributed job DistributedJobKey = "distributed-job" // DistributedJobValue indicate distributed job DistributedJobValue = "true" // StandaloneJobValue indicate standalone job StandaloneJobValue = "false" // SuperPodAnnoKey annotation key of super pod SuperPodAnnoKey = "sp-block" // DistributedInferKey distributed infer DistributedInferKey = "distributed" // DistributedInferLabel true or false DistributedInferLabel = "true" // OperatorNameLabelKey pod label key for acjob operator name OperatorNameLabelKey = "training.kubeflow.org/operator-name" )
const ( // AffScore0 value 0 for scored. AffScore0 = iota // AffScore1 value 1 for scored. AffScore1 // AffScore2 value 2 for scored. AffScore2 // AffScore3 value 3 for scored. AffScore3 // AffScore4 value 4 for scored. AffScore4 // AffScore5 value 5 for scored. AffScore5 // AffScore6 value 6 for scored. AffScore6 // AffScore7 value 7 for scored. AffScore7 // AffScore8 value 8 for scored. AffScore8 // AffScore15 value 15 for scored. AffScore15 )
const ( // JobNotEnqueue job enqueue failed JobNotEnqueue = -1 // JobEnqueue job enqueue success JobEnqueue = 1 // JobEnqueueSkip skip the judgement of ascend-volcano-plugin in the job enqueue phase JobEnqueueSkip = 0 // PodGroupInqueue the pg Inqueue status PodGroupInqueue = "Inqueue" // PodGroupPending the pg Pending status PodGroupPending = "Pending" // PodGroupRunning the pg Running status PodGroupRunning = "Running" // PodGroupUnknown the pg Unknown status PodGroupUnknown = "Unknown" // PodGroupUnschedulableType the pg Unschedulable Condition PodGroupUnschedulableType = "Unschedulable" // EnableFunc enable the function EnableFunc = "on" // SinglePodTag the tag of single pod rescheduling SinglePodTag = "pod-rescheduling" // ProcessRecoverEnable the tag of process rescheduling ProcessRecoverEnable = "process-recover-enable" // BaseDeviceInfoKey base device info key BaseDeviceInfoKey = "baseDeviceInfos" // TorAffinityKey the key of tor affinity TorAffinityKey = "tor-affinity" // LargeModelTag the value of large model LargeModelTag = "large-model-schema" // NormalSchema the value of normal tor affinity NormalSchema = "normal-schema" // NullTag the value means not use tor affinity NullTag = "null" // DevSplitNum device split number DevSplitNum = 2 )
const ( // SeparateFaultStrategy Separate task SeparateFaultStrategy = "Separate" // SubHealthFaultStrategy SubHealth task SubHealthFaultStrategy = "SubHealth" // RelationFault fault type of relation fault RelationFault = "RelationFaultSeparate" )
const ( // Permit indicates permits job to be pipelined Permit = 1 // Abstain indicates abstains in voting job to be pipelined Abstain = 0 // Reject indicates rejects job to be pipelined Reject = -1 )
const ( // Namespace check item podName namespace Namespace = "namespace" // PodName check item podName PodName = "podName" // PodNameMaxLength pod name max length PodNameMaxLength = 253 // PodNameSpaceMaxLength pod namespace max length PodNameSpaceMaxLength = 63 // PodAnnotationMaxLength pod annotation max data length 1MB PodAnnotationMaxLength = 1024 * 1024 // MaxDevicesNum max device num MaxDevicesNum = 100 )
const ( // AppTypeLabelKey job type label key AppTypeLabelKey = "app" // ControllerAppType controller app type ControllerAppType = "mindie-ms-controller" // CoordinatorAppType coordinator app type CoordinatorAppType = "mindie-ms-coordinator" )
const ( // NeedVolcanoOpeKey for volcano NeedVolcanoOpeKey = "needVolcanoOpe" // OpeTypeDelete operation type delete OpeTypeDelete = "delete" )
Constants used by the hot switch strategy.
const (
// Rank0 default time of pod deleted
Rank0 = "0"
)
Variables ¶
This section is empty.
Functions ¶
func ChangeIntArrToStr ¶
ChangeIntArrToStr converts a []int to a string, e.g. [0,1] -> "Ascend910-0,Ascend910-1".
func ChangeNodesToNodeMaps ¶
ChangeNodesToNodeMaps change nodes slice into node maps
func ChangeTopToIntArray ¶
ChangeTopToIntArray Change npu card ids from string to int array.
func CheckPodNameOrSpace ¶
CheckPodNameOrSpace check pod name or pod namespace
func CheckStrInSlice ¶
CheckStrInSlice return whether str in string slice
func ConvertErrSliceToError ¶
ConvertErrSliceToError convert []error to one error.
func GetActivePodUsedDevFromNode ¶
GetActivePodUsedDevFromNode get active pod used device from node
func GetAvailableDevInfo ¶
GetAvailableDevInfo get available device info from device list
func GetDeviceType ¶
GetDeviceType get device type from dev list
func GetNodeDevListFromAnno ¶
GetNodeDevListFromAnno get node device list from annotation
func GetNpuNameFromJobRequire ¶
GetNpuNameFromJobRequire gets the NPU name; if the job's required resource name is npu-core, it returns huawei.com/Ascend310P.
func GetRecoveringDevInfo ¶
GetRecoveringDevInfo get recovering device info from device list
func GetTaskInfoByNameFromSSN ¶
func GetTaskInfoByNameFromSSN(ssn *framework.Session, taskName, taskNamespace string) (*api.TaskInfo, error)
GetTaskInfoByNameFromSSN get corresponding api.TaskInfo object by given taskName
func GetUnhealthyDevInfo ¶
GetUnhealthyDevInfo get unhealthy device info from device list
func GetVTaskUseTemplate ¶
GetVTaskUseTemplate the format is : 0-vir04-3c_ndvpp,0-vir0
func IsMapHasNPUResource ¶
IsMapHasNPUResource Determines whether a target string exists in the map.
func IsNPUTask ¶
IsNPUTask reports whether the task is an NPU task.
func IsSliceContain ¶
func IsSliceContain(keyword interface{}, targetSlice interface{}) bool
IsSliceContain judges whether keyword in targetSlice
func IsStrategyInSubHealthyStrategs ¶
IsStrategyInSubHealthyStrategs reports whether subHealthyStrategy is one of the sub-healthy strategies.
func MakeDataHash ¶
func MakeDataHash(data interface{}) string
MakeDataHash makes the check code (hash) for configmap data.
func ReferenceNameOfJob ¶
ReferenceNameOfJob get name of job
func RemoveCommonElement ¶
RemoveCommonElement removes from s1 the elements it has in common with s2.
func RemoveSliceDuplicateElement ¶
RemoveSliceDuplicateElement remove duplicate element in slice
Types ¶
type ComJob ¶
type ComJob struct {
Name api.JobID
ReferenceName string
NameSpace string
Status string
Annotation map[string]string
Selector map[string]string
Label map[string]string
}
ComJob holds the attributes that every vcJob has.
type Device ¶
type Device struct {
DeviceID string `json:"device_id"` // device id
DeviceIP string `json:"device_ip"` // device ip
SuperDeviceID string `json:"super_device_id,omitempty"`
}
Device is the device ID information for an Instance.
type Instance ¶
type Instance struct {
PodName string `json:"pod_name"` // pod Name
ServerID string `json:"server_id"` // serverdId
SuperPodId int32 `json:"super_pod_id"`
Devices []Device `json:"devices"` // dev
}
Instance is for annotation
type NPUJob ¶
type NPUJob struct {
// the mapKey is taskID, not Name.
Tasks map[api.TaskID]NPUTask
NPUTaskNum int
SchedulingTaskNum int
ReqNPUName string
ReqNPUNum int
SpBlockNPUNum int
SubHealthyStrategy string
}
NPUJob holds the attributes that only NPU vcJobs have.
func (*NPUJob) GetSchedulingTaskNum ¶
GetSchedulingTaskNum get the num of scheduling task
func (*NPUJob) IsNPUJob ¶
IsNPUJob determines whether the job is an NPU job. Dynamic segmentation: huawei.com/npu-core. Static segmentation: huawei.com/Ascend910-Y. No segmentation: huawei.com/Ascend910.
type NPUTask ¶
type NPUTask struct {
Name string
NameSpace string
ReqNPUName string
ReqNPUNum int
Annotation map[string]string
Label map[string]string
NodeName string
PodStatus v1.PodPhase
Index int
*VTask
}
NPUTask holds the information an NPU task needs.
func (*NPUTask) DeleteRealPodByTask ¶
DeleteRealPodByTask generally used by force deletion
func (*NPUTask) EvictJobByTask ¶
EvictJobByTask generally used by grace deletion
func (*NPUTask) ForceDeletePodByTaskInf ¶
func (asTask *NPUTask) ForceDeletePodByTaskInf(ssn *framework.Session, reason string, nodeName string) error
ForceDeletePodByTaskInf Force delete pod by taskInf.
func (*NPUTask) InitVTask ¶
InitVTask init vNPU task.
func (*NPUTask) IsNPUTask ¶
IsNPUTask determines whether the task is an NPU task. Dynamic segmentation: huawei.com/npu-core. Static segmentation: huawei.com/Ascend910-Y. No segmentation: huawei.com/Ascend910.
func (*NPUTask) IsTaskInItsNode ¶
IsTaskInItsNode check if task is on the node
func (*NPUTask) IsVNPUTask ¶
IsVNPUTask determines whether the task is a virtual NPU task. Dynamic segmentation: huawei.com/npu-core. No segmentation: huawei.com/Ascend910.
type NpuBaseInfo ¶
NpuBaseInfo npu base info
type SchedulerJobAttr ¶
SchedulerJobAttr vcJob's attribute.
func (SchedulerJobAttr) GetPluginNameByReq ¶
func (sJob SchedulerJobAttr) GetPluginNameByReq() string
GetPluginNameByReq get plugin name by job request resource name
func (*SchedulerJobAttr) IsJobHasTorAffinityLabel ¶
func (sJob *SchedulerJobAttr) IsJobHasTorAffinityLabel() bool
IsJobHasTorAffinityLabel check job has tor affinity label
func (SchedulerJobAttr) IsLargeModelJob ¶
func (sJob SchedulerJobAttr) IsLargeModelJob() bool
IsLargeModelJob job is large model job
func (*SchedulerJobAttr) IsTorAffinityJob ¶
func (sJob *SchedulerJobAttr) IsTorAffinityJob() bool
IsTorAffinityJob check job is tor affinity job
type TaskAllocated ¶
type TaskAllocated struct {
// like ubuntu
NodeName string
// element like 1
CardName []int
// element like Ascend310P-2c-100-1
PhysicsName []string
}
TaskAllocated Task allocated struct.
type VResource ¶
VResource resource dimensions
func (VResource) BeGreater ¶
BeGreater reports whether the resource is greater than or equal to the resource it is compared with.
type VTask ¶
type VTask struct {
// TASK_STATUS_INIT...
Status int
Allocated TaskAllocated
}
VTask virtual NPU task struct.
Source Files
¶
- constants.go
- job.go
- task.go
- type.go
- util.go