昇腾CANN release-management 实战:版本发布的自动化流水线与 CI/CD 质量门禁
·
55 个仓库、每月 3 个版本分支、每个 Release 需要跑 200+ 个 CI Job——靠人工手动打 tag、检查 changelog、验证兼容性矩阵,一个版本要耗掉 2 个 Release Manager 整整一周。release-management 仓库把这一切自动化:版本号生成 → changelog 聚合 → 跨仓库兼容性检查 → CI 质量门禁 → 自动打 tag/发版。
版本号规范
CANN 版本号: v{MAJOR}.{MINOR}.{PATCH}.{BUILD}
v8.0.3.b001
│ │ │ └── BUILD: 构建号/热修复号 (b001=第一个热修复)
│ │ └──── PATCH: 补丁版本 (bugfix)
│ └────── MINOR: 功能版本 (新算子/新特性)
└──────── MAJOR: 大版本 (架构变更)
规则:
- MAJOR 变更: 算子 API 不兼容、Ascend C 语法变化 → 加主版本号
- MINOR 变更: 新算子仓库、新框架支持 → 加功能版本号
- PATCH 变更: 性能修复、精度 bug、安全补丁 → 补丁版本号
- BUILD: 内部构建号,公开发布版 BUILD=0 (v8.0.3 即 v8.0.3.0)
版本号自动生成
# release-management/versioning/auto_version.py
#
# 基于 git 提交语义自动生成版本号
# 规范: Conventional Commits (feat:/fix:/perf:/refactor:)
import re
import subprocess
from dataclasses import dataclass
from typing import List, Optional, Tuple
@dataclass
class Version:
    """CANN version number: MAJOR.MINOR.PATCH plus an optional BUILD counter.

    A ``build`` of 0 is omitted when the version is rendered as a string;
    any other value is rendered as a zero-padded ``.bNNN`` suffix.
    """

    major: int
    minor: int
    patch: int
    build: int = 0

    def bump_major(self):
        """Return a new Version with MAJOR + 1 and every lower field reset to 0."""
        return Version(major=self.major + 1, minor=0, patch=0, build=0)

    def bump_minor(self):
        """Return a new Version with MINOR + 1 and PATCH/BUILD reset to 0."""
        return Version(major=self.major, minor=self.minor + 1, patch=0, build=0)

    def bump_patch(self):
        """Return a new Version with PATCH + 1 and BUILD reset to 0."""
        return Version(major=self.major, minor=self.minor, patch=self.patch + 1, build=0)

    def __str__(self):
        """Render as ``vX.Y.Z`` or, when build is non-zero, ``vX.Y.Z.bNNN``."""
        base = f"v{self.major}.{self.minor}.{self.patch}"
        if self.build != 0:
            return f"{base}.b{self.build:03d}"
        return base
class VersionBumper:
    """Derive the next version number from Conventional Commits subjects.

    Bump policy:
      - ``feat:``                        -> MINOR
      - any type carrying a ``!`` marker -> MAJOR (breaking change)
      - ``fix:`` / ``perf:``             -> PATCH
      - ``refactor:`` / ``docs:`` / ``test:`` / ``chore:`` -> no semantic
        bump; ``compute_bump`` increments BUILD when nothing else applies
    """

    BUMP_RULES = {
        "feat!": "major",  # breaking change feature
        "feat": "minor",
        "fix": "patch",
        "perf": "patch",
        "refactor": None,
        "docs": None,
        "test": None,
        "chore": None,
    }

    # Severity ordering used to keep the highest bump seen so far.
    _BUMP_RANK = {"none": 0, "patch": 1, "minor": 2, "major": 3}

    # Label prepended to each recorded change, keyed by bump type.
    _CHANGE_LABELS = {
        "major": "[BREAKING]",
        "minor": "[FEATURE]",
        "patch": "[FIX/PERF]",
    }

    # "<hash> " (optional, emitted by `git log --oneline`), then
    # type, optional "(scope)", optional "!" on either side of the scope, ":".
    _COMMIT_RE = re.compile(
        r'^(?:[0-9a-f]{7,64}\s+)?'
        r'(?P<type>\w+)(?P<pre_bang>!)?(?:\([^()]+\))?(?P<post_bang>!)?'
        r':\s*(?P<desc>.+)'
    )

    def __init__(self, repo_path: str, current_version: "Version"):
        """Store the repository path and the version to bump from."""
        self.repo_path = repo_path
        self.current = current_version

    def get_commits_since(self, tag: str) -> List[str]:
        """Return the ``git log --oneline`` lines for ``tag..HEAD`` (no merges).

        Best effort: an empty list is returned when git cannot be started
        or produces no output.
        """
        try:
            result = subprocess.run(
                ["git", "log", f"{tag}..HEAD", "--oneline", "--no-merges"],
                cwd=self.repo_path, capture_output=True, text=True
            )
        except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
            return []
        return [line.strip() for line in result.stdout.splitlines() if line.strip()]

    def classify_commit(self, message: str) -> Optional[str]:
        """Return "major", "minor", "patch" or None for one commit line.

        ``message`` may be a bare subject or a ``git log --oneline`` line
        with the abbreviated hash in front. None means the line does not
        follow the convention or its type triggers no semantic bump.
        """
        match = self._COMMIT_RE.match(message.strip())
        if not match:
            return None  # not a Conventional Commits subject: ignore

        # A breaking-change marker outranks whatever the type would map to.
        if match.group("pre_bang") or match.group("post_bang"):
            return "major"
        return self.BUMP_RULES.get(match.group("type").lower())

    def compute_bump(self, since_tag: str) -> Tuple["Version", List[str]]:
        """Compute the next version from the commits after ``since_tag``.

        Returns:
            (new version, list of change descriptions). Every commit that
            maps to a bump type is listed, not only the ones that raised
            the bump level.
        """
        commits = self.get_commits_since(since_tag)
        if not commits:
            return self.current, ["No changes since last tag"]

        max_bump = "none"
        changes = []
        for commit in commits:
            bump_type = self.classify_commit(commit)
            if bump_type not in self._CHANGE_LABELS:
                continue
            changes.append(f"{self._CHANGE_LABELS[bump_type]} {commit}")
            if self._BUMP_RANK[bump_type] > self._BUMP_RANK[max_bump]:
                max_bump = bump_type

        # Apply the highest bump level found.
        if max_bump == "major":
            new_version = self.current.bump_major()
        elif max_bump == "minor":
            new_version = self.current.bump_minor()
        elif max_bump == "patch":
            new_version = self.current.bump_patch()
        else:
            # Only non-bumping commits (refactor/docs/...): bump BUILD.
            new_build = self.current.build + 1
            new_version = Version(
                self.current.major, self.current.minor,
                self.current.patch, new_build
            )
            changes.append("[BUILD] Internal build bump")
        return new_version, changes
# 使用示例
if __name__ == "__main__":
    # Demo: compute the version that follows the v8.0.3 tag of ops-transformer
    # and print it together with the collected change lines.
    demo_bumper = VersionBumper("../ops-transformer", Version(8, 0, 3))
    next_version, change_lines = demo_bumper.compute_bump("v8.0.3")
    print(f"Next version: {next_version}")
    for change in change_lines:
        print(f" {change}")
跨仓库兼容性矩阵
# release-management/compat/compatibility_matrix.py
#
# 跨仓库兼容性检查: 验证 55 个仓库的版本组合是否合法
# 原则: 下游仓库不能依赖未发布的上游版本
from dataclasses import dataclass, field
from typing import Dict, Set, List, Tuple
@dataclass
class RepoVersion:
    """A repository name paired with one of its version strings."""

    # Repository name.
    repo: str
    # Version string for that repository.
    version: str
@dataclass
class CompatibilityConstraint:
    """One dependency rule: ``consumer`` needs ``dep_repo`` at a given version."""

    # Downstream repository that has the dependency.
    consumer: str
    # Upstream repository being depended on.
    dep_repo: str
    # Version the constraint is evaluated against.
    min_version: str
    # Comparison operator: ">=" | "==" | "~="
    constraint_type: str
class CompatibilityMatrix:
    """Cross-repository compatibility matrix.

    Holds the dependency constraints between repositories and checks a
    proposed set of release versions against them.
    """

    # (consumers, upstream repos) groups. Constraints are generated
    # consumer-major in this order; every one is ">= v8.0.0".
    _DEPENDENCY_GROUPS = (
        # Core operator libraries depend on opbase.
        (("ops-math", "ops-nn", "ops-blas", "ops-cv",
          "ops-fft", "ops-rand", "ops-tensor", "ops-transformer"),
         ("opbase",)),
        # ATB depends on ops-transformer.
        (("ascend-transformer-boost",), ("ops-transformer",)),
        # catlass depends on opbase + ops-blas.
        (("catlass",), ("opbase", "ops-blas")),
        # Recipes depend on ATB + runtime.
        # NOTE(review): the original comment also listed ops-transformer,
        # but no such constraint was ever registered — confirm intent.
        (("cann-recipes-infer", "cann-recipes-train"),
         ("ascend-transformer-boost", "runtime")),
        # torchtitan-npu depends on hccl + ATB.
        (("torchtitan-npu",), ("hccl", "ascend-transformer-boost")),
        # ge depends on runtime + metadef.
        (("ge",), ("runtime", "metadef")),
        # The tensorflow adapter depends on ge.
        (("tensorflow",), ("ge",)),
        # hcomm / hixl / shmem depend on hccl.
        (("hcomm", "hixl", "shmem"), ("hccl",)),
        # Operator libraries depend on ascend-boost-comm.
        (("ops-math", "ops-nn", "ops-transformer", "ops-cv", "ops-blas"),
         ("ascend-boost-comm",)),
    )

    def __init__(self):
        """Create the matrix and populate the built-in constraints."""
        self.constraints: List[CompatibilityConstraint] = []
        self._init_constraints()

    def _init_constraints(self):
        """Append one constraint per (consumer, upstream) pair of the table."""
        for consumers, upstreams in self._DEPENDENCY_GROUPS:
            for consumer in consumers:
                for dep_repo in upstreams:
                    self.constraints.append(CompatibilityConstraint(
                        consumer=consumer, dep_repo=dep_repo,
                        min_version="v8.0.0", constraint_type=">="
                    ))

    def check(self, release_versions: Dict[str, str]) -> List[str]:
        """Check whether a combination of release versions is valid.

        Args:
            release_versions: ``{repo_name: version_string}``.

        Returns:
            A list of error messages; an empty list means the check passed.
            A version string that cannot be parsed is reported as an error
            entry instead of aborting the whole check.
        """
        errors = []
        for const in self.constraints:
            consumer_ver = release_versions.get(const.consumer)
            dep_ver = release_versions.get(const.dep_repo)
            if consumer_ver is None:
                continue  # consumer is not part of this release: skip
            requirement = f"{const.dep_repo}{const.constraint_type}{const.min_version}"
            if dep_ver is None:
                errors.append(
                    f"Missing dependency: {const.consumer}@{consumer_ver} "
                    f"requires {requirement}"
                )
                continue
            try:
                satisfied = self._version_satisfies(
                    dep_ver, const.min_version, const.constraint_type
                )
            except ValueError as exc:
                errors.append(
                    f"Invalid version: {const.consumer}@{consumer_ver} "
                    f"requires {requirement}: {exc}"
                )
                continue
            if not satisfied:
                errors.append(
                    f"Version conflict: {const.consumer}@{consumer_ver} "
                    f"requires {requirement}, "
                    f"but {const.dep_repo}@{dep_ver} is released"
                )
        return errors

    def _version_satisfies(self, actual: str, required: str,
                           constraint_type: str) -> bool:
        """Return True when ``actual`` satisfies ``required`` under the operator.

        An unrecognised ``constraint_type`` yields False.

        Raises:
            ValueError: if either version string cannot be parsed.
        """
        act = self._parse_version(actual)
        req = self._parse_version(required)
        if constraint_type == ">=":
            return act >= req
        if constraint_type == "==":
            return act == req
        if constraint_type == "~=":
            # Compatible release: same major.minor, patch at least as high.
            return act[0] == req[0] and act[1] == req[1] and act[2] >= req[2]
        return False

    def _parse_version(self, v: str) -> Tuple[int, int, int]:
        """Parse ``v8.0.3`` into ``(8, 0, 3)``.

        Missing components default to 0 and anything after the third
        component (for example a ``b001`` build suffix) is ignored.

        Raises:
            ValueError: if one of the first three components is not an
                integer; the message names the offending version string.
        """
        parts = v.strip().lstrip('v').split('.')[:3]
        try:
            numbers = [int(part) for part in parts]
        except ValueError:
            raise ValueError(f"Invalid version string: {v!r}") from None
        numbers.extend([0] * (3 - len(numbers)))
        return (numbers[0], numbers[1], numbers[2])
CI 质量门禁
# release-management/ci/quality_gate.yml
#
# Release 质量门禁: 7 道关卡,全部通过后自动打 tag
name: CANN Release Quality Gate

on:
  workflow_dispatch:
    inputs:
      release_version:
        description: "Release version (e.g. v8.0.4)"
        required: true
      repos:
        description: "Repos to release (comma-separated)"
        required: true
        default: "ops-transformer,catlass,hccl,ge,runtime"

# Inputs reach the shell through environment variables instead of being
# pasted into the script text, so a crafted value cannot inject commands.
env:
  RELEASE_VERSION: ${{ inputs.release_version }}
  RELEASE_REPOS: ${{ inputs.repos }}

jobs:
  gate-version-check:
    name: "[1/7] Version Bump Check"
    runs-on: ubuntu-latest
    outputs:
      # JSON array form of the comma-separated `repos` input, consumed by
      # the unit-test matrix below.
      repos_json: ${{ steps.repo-list.outputs.repos_json }}
    steps:
      - name: Validate version format
        run: |
          if [[ ! "$RELEASE_VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
            echo "Invalid version format. Expected vX.Y.Z"
            exit 1
          fi
      - name: Build repo list
        id: repo-list
        run: |
          # fromJSON() needs real JSON; turn "a,b,c" into ["a","b","c"].
          repos_json=$(jq -cn --arg repos "$RELEASE_REPOS" \
            '$repos | split(",") | map(gsub("^\\s+|\\s+$"; "")) | map(select(length > 0))')
          echo "repos_json=$repos_json" >> "$GITHUB_OUTPUT"
      - name: Check version not already released
        run: |
          for repo in $(echo "$RELEASE_REPOS" | tr ',' ' '); do
            if git ls-remote --tags "https://atomgit.com/cann/$repo.git" \
                "refs/tags/$RELEASE_VERSION" | grep -q .; then
              echo "Tag $RELEASE_VERSION already exists in $repo"
              exit 1
            fi
          done

  gate-unit-tests:
    name: "[2/7] Unit Tests"
    needs: gate-version-check
    strategy:
      matrix:
        repo: ${{ fromJSON(needs.gate-version-check.outputs.repos_json) }}
    runs-on: [self-hosted, ascend-910]
    steps:
      - uses: actions/checkout@v4
        with:
          repository: cann/${{ matrix.repo }}
      - name: Build & Test
        run: |
          source /usr/local/Ascend/ascend-toolkit/set_env.sh
          mkdir build && cd build
          cmake .. -DCMAKE_BUILD_TYPE=Release
          make -j$(nproc)
          ctest --output-on-failure --timeout 300

  gate-compatibility:
    name: "[3/7] Cross-Repo Compatibility"
    needs: gate-unit-tests
    runs-on: ubuntu-latest
    steps:
      # The scripts below are addressed as release-management/...; check the
      # repository out under that directory so those paths exist.
      # NOTE(review): assumes this workflow lives in the release-management
      # repository — confirm.
      - uses: actions/checkout@v4
        with:
          path: release-management
      - name: Check compatibility matrix
        run: |
          # NOTE(review): requires compatibility_matrix.py to expose a
          # command-line entry point accepting these flags — confirm.
          python release-management/compat/compatibility_matrix.py \
            --version "$RELEASE_VERSION" \
            --repos "$RELEASE_REPOS"

  gate-performance:
    name: "[4/7] Performance Regression"
    needs: gate-compatibility
    runs-on: [self-hosted, ascend-910]
    steps:
      - uses: actions/checkout@v4
        with:
          path: release-management
      - name: Benchmark
        run: |
          # Fails when performance regresses by more than 5%.
          python release-management/ci/benchmark_runner.py \
            --version "$RELEASE_VERSION" \
            --baseline "$(python release-management/versioning/last_tag.py)" \
            --threshold 5

  gate-precision:
    name: "[5/7] Precision Validation"
    needs: gate-performance
    runs-on: [self-hosted, ascend-910]
    steps:
      - uses: actions/checkout@v4
        with:
          path: release-management
      - name: Precision tests
        run: |
          python release-management/ci/precision_validator.py \
            --tolerance 1e-5 \
            --repos "$RELEASE_REPOS"

  gate-security:
    name: "[6/7] Security Scan"
    needs: gate-precision
    runs-on: ubuntu-latest
    permissions:
      actions: read
      contents: read
      security-events: write
    steps:
      - uses: actions/checkout@v4
      # analyze only works after init has set up the CodeQL databases.
      # NOTE(review): the language list is an assumption — adjust it to
      # what the scanned repository actually contains.
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        with:
          languages: python
      - name: SAST scan
        uses: github/codeql-action/analyze@v3
      - name: Dependency audit
        run: |
          for repo in $(echo "$RELEASE_REPOS" | tr ',' ' '); do
            echo "Auditing $repo..."
            # Check whether dependency versions have known CVEs.
          done

  gate-release:
    name: "[7/7] Auto Tag & Release"
    needs: gate-security
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          path: release-management
      - name: Generate changelog
        run: |
          # NOTE(review): generate_changelog.py declares --from-tag,
          # --to-tag, --repos and --output but no --version; align the
          # script and this call before relying on this step.
          python release-management/versioning/generate_changelog.py \
            --version "$RELEASE_VERSION" \
            --output "CHANGELOG_${RELEASE_VERSION}.md"
      - name: Create tags & releases
        run: |
          for repo in $(echo "$RELEASE_REPOS" | tr ',' ' '); do
            echo "Releasing $repo @ $RELEASE_VERSION"
            python release-management/ci/auto_release.py \
              --repo "cann/$repo" \
              --version "$RELEASE_VERSION" \
              --changelog "CHANGELOG_${RELEASE_VERSION}.md"
          done
      - name: Notify
        run: |
          echo "Release $RELEASE_VERSION completed!"
          echo "Repos: $RELEASE_REPOS"
Changelog 自动生成
# release-management/versioning/generate_changelog.py
#
# 聚合 55 个仓库的变更记录 → 统一 CHANGELOG
import subprocess
from collections import defaultdict
class ChangelogGenerator:
    """Build one Markdown changelog from the git history of several repos.

    Output layout (sections without entries are omitted):

        # CANN v8.0.4 Release Notes

        ## Summary
        - N changes across M repositories

        ## Breaking Changes
        ## New Features
        ## Performance Improvements
        ## Bug Fixes
        ## Known Issues

    Each entry has the form ``- <repo>: <git log --oneline line>``.
    """

    def __init__(self, repos: list, from_tag: str, to_tag: str):
        """Remember the repositories and the ``from_tag..to_tag`` range."""
        self.repos = repos
        self.from_tag = from_tag
        self.to_tag = to_tag

    def generate(self, output_path: str):
        """Collect, categorise and write the changelog to ``output_path``."""
        categories = defaultdict(list)
        for repo in self.repos:
            for commit in self._get_commits(repo):
                category, entry = self._parse_commit(repo, commit)
                if category and entry:
                    categories[category].append(entry)

        # Render Markdown.
        total = sum(len(entries) for entries in categories.values())
        lines = [
            f"# CANN {self.to_tag} Release Notes",
            "",
            "## Summary",
            f"- {total} changes across {len(self.repos)} repositories",
            "",
        ]
        category_order = [
            ("breaking", "## Breaking Changes"),
            ("feature", "## New Features"),
            ("performance", "## Performance Improvements"),
            ("fix", "## Bug Fixes"),
            ("known_issue", "## Known Issues"),
        ]
        for key, header in category_order:
            if key in categories:
                lines.append(header)
                lines.extend(f"- {entry}" for entry in categories[key])
                lines.append("")

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))

    def _get_commits(self, repo: str) -> list:
        """Return the non-empty ``git log --oneline`` lines of ``../<repo>``.

        Best effort: an empty list is returned when git cannot be started.
        """
        try:
            result = subprocess.run(
                ["git", "log", f"{self.from_tag}..{self.to_tag}",
                 "--oneline", "--no-merges"],
                cwd=f"../{repo}", capture_output=True, text=True
            )
        except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
            return []
        return [
            line.strip() for line in result.stdout.split('\n')
            if line.strip()
        ]

    def _parse_commit(self, repo: str, commit: str) -> tuple:
        """Classify one commit line.

        Returns:
            ``(category, markdown_entry)``, or ``(None, None)`` when the
            commit belongs to no changelog section.
        """
        import re

        # `git log --oneline` puts the abbreviated hash in front of the
        # subject; classification has to look at the subject only.
        subject = re.sub(r'^[0-9a-f]{7,64}\s+', '', commit.strip())
        entry = f"{repo}: {commit}"

        # "type!:" or "type(scope)!:" marks a breaking change.
        marker = re.match(r'^\w+(!?)(?:\([^()]+\))?(!?):', subject)
        has_bang = bool(marker and (marker.group(1) or marker.group(2)))

        if has_bang or "BREAKING CHANGE" in subject.upper():
            return ("breaking", entry)
        if subject.startswith("feat"):
            return ("feature", entry)
        if subject.startswith("perf"):
            return ("performance", entry)
        if subject.startswith("fix"):
            return ("fix", entry)
        if "known" in subject.lower():
            return ("known_issue", entry)
        return (None, None)
# 命令行入口
if __name__ == "__main__":
    import argparse

    # All four flags are mandatory; --repos is a comma-separated list.
    cli = argparse.ArgumentParser()
    for flag in ("--from-tag", "--to-tag", "--repos", "--output"):
        cli.add_argument(flag, required=True)
    options = cli.parse_args()

    repo_names = options.repos.split(',')
    generator = ChangelogGenerator(repo_names, options.from_tag, options.to_tag)
    generator.generate(options.output)
踩坑:版本兼容性检查的顺序依赖——先检查底层仓库
# ❌ 按字母序检查: atvc → ge → hccl → opbase → ops-math → ...
# 检查 atvc 时 opbase 版本还不确定 → 报假错误
#
# ✅ 拓扑排序: 先检查无依赖的底层仓库,再检查上层
def topological_order(constraints):
    """Order repositories so that every upstream precedes its consumers.

    Kahn's algorithm over the dependency graph: each constraint adds an
    edge ``dep_repo -> consumer``.

    Args:
        constraints: iterable of objects with ``consumer`` and ``dep_repo``
            attributes.

    Returns:
        List of repository names. Repositories without upstreams come
        first, in sorted order, so the result is the same on every run.

    Raises:
        ValueError: if the constraints contain a dependency cycle, since
            no valid order exists for the repositories involved.
    """
    from collections import defaultdict, deque

    in_degree = defaultdict(int)
    graph = defaultdict(list)
    all_repos = set()
    for c in constraints:
        graph[c.dep_repo].append(c.consumer)
        in_degree[c.consumer] += 1
        all_repos.add(c.consumer)
        all_repos.add(c.dep_repo)

    # BFS from the nodes with no upstream; sorted because set iteration
    # order is not stable between interpreter runs.
    queue = deque(sorted(r for r in all_repos if in_degree[r] == 0))
    order = []
    while queue:
        repo = queue.popleft()
        order.append(repo)
        for downstream in graph[repo]:
            in_degree[downstream] -= 1
            if in_degree[downstream] == 0:
                queue.append(downstream)

    # Nodes on a cycle never reach in-degree 0 and would be silently lost.
    if len(order) != len(all_repos):
        stuck = sorted(all_repos.difference(order))
        raise ValueError(f"Dependency cycle detected among: {', '.join(stuck)}")
    return order
踩坑:版本 bump 的"空 Release"问题——所有仓库都没变化时强制跳版本
# ❌ 所有仓库都没有 feat/fix/perf → bump_type="none" → 版本号不变
# 但用户期望看到一个新的 Release (v8.0.3 → v8.0.4)
#
# ✅ 如果所有 bump 都是 "none":
# 1. 检查是否是预定的周期性 Release (如月度 release)
# 2. 如果是 → bump BUILD (+1) → v8.0.3.b001 → v8.0.3.b002
# 3. 如果不是 → 提示用户是否真的要发空版本
# Handle a release in which no commit triggered a semantic bump.
# NOTE(review): max_bump, is_scheduled_release, current_version, confirm and
# sys come from the surrounding context, which is not shown here.
if max_bump == "none":
    if is_scheduled_release:
        # Scheduled release without changes: only the BUILD number moves.
        new_build = current_version.build + 1
        new_version = Version(
            current_version.major, current_version.minor,
            current_version.patch, new_build,
        )
        print(f"Scheduled release: bumping BUILD → {new_version}")
    else:
        print("WARNING: No changes detected. Are you sure you want to release?")
        if not confirm():
            sys.exit(0)
release-management 的自动化 Release 流程:版本号基于 Conventional Commits 自动 bump(feat→minor、feat!→major、fix/perf→patch、空→build)→兼容性矩阵检查 55 仓依赖拓扑(底层先审,拓扑排序 BFS)→7 道 CI 质量门禁(版本校验→单元测试→跨仓兼容性→性能回归>5% 失败→精度验证 ε<1e-5→安全扫描→自动 tag+changelog)。踩坑:按字母序检查导致上层仓库先于依赖被审→拓扑排序先底层后上层、空 Release 版本号不变→周期 Release 自动 bump BUILD、手动 Release 弹确认。
鲲鹏昇腾开发者社区是面向全社会开放的“联接全球计算开发者,聚合华为+生态”的社区,内容涵盖鲲鹏、昇腾资源,帮助开发者快速获取所需的知识、经验、软件、工具、算力,支撑开发者易学、好用、成功,成为核心开发者。
更多推荐



所有评论(0)