华为Ascend C算子开发能力认证(中级)
实现一个sigmoid算子,编译,打包,安装。然后用给的test脚本校验答案。
要求
实现一个sigmoid算子,编译,打包,安装。然后用给的test脚本校验答案
基本流程
1.配置
先运行
init_env.sh //下载包 配置环境变量
source ~/.bashrc //bash的配置
source /home/ma-user/Ascend/ascend-toolkit/set_env.sh //环境变量
chmod +x -R * //运行脚本权限
修改,补全这四个文件
SigmoidCustom/SigmoidCustom/CMakePresets.json
SigmoidCustom/SigmoidCustom/op_host/sigmoid_custom_tiling.h
SigmoidCustom/SigmoidCustom/op_host/sigmoid_custom.cpp
SigmoidCustom/SigmoidCustom/op_kernel/sigmoid_custom.cpp
然后运行编译
bash build.sh
得到build_out下的安装包
运行这个安装包
cd build_out
./custom_opp_ubuntu_aarch64.run
最后去AclNNInvocation运行检验脚本
bash run.sh
检验脚本会调起py脚本,分别用numpy和我们写的算子计算答案,然后对比误差
这里既然把检验脚本也给我们了,其实可以在脚本里输出中间变量来对比结果,从而得到比“是否通过”更多的信息
可以看到这里就是numpy硬算的
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# Copyright 2022-2023 Huawei Technologies Co., Ltd
import numpy as np
def sigmoid(x):
return 1 / (1 + np.exp(-x))
# return 1 + np.exp(-x)
def gen_golden_data_simple():
input_x = np.random.uniform(-3, 3, [8, 2048]).astype(np.float16)
golden = sigmoid(input_x)
input_x.tofile("./input/input_x.bin")
golden.tofile("./output/golden.bin")
if __name__ == "__main__":
gen_golden_data_simple()
对比脚本,如果误差过大,可以输出一下标准答案和我们的答案
import os
import sys
import numpy as np

# Tolerance: for fp16 both the absolute and the relative error are usually
# required to stay within one part in a thousand.
loss = 1e-3
# Tiny epsilon added to the denominator to avoid division by zero.
minimum = 10e-10


def verify_result(real_result, golden):
    """Compare an operator's output file against the golden file.

    Both files hold raw fp16 data. Passes when either the absolute or the
    relative error criterion holds for (almost) all elements; up to a
    ``loss`` fraction of outliers is tolerated on both criteria at once.
    Returns True on pass, False on failure.
    """
    real_result = np.fromfile(real_result, dtype=np.float16)  # actual output
    golden = np.fromfile(golden, dtype=np.float16)            # expected output
    result = np.abs(real_result - golden)                     # per-element deviation
    # Per-element max magnitude, used as the relative-error denominator.
    deno = np.maximum(np.abs(real_result), np.abs(golden))
    result_atol = np.less_equal(result, loss)                 # absolute-error check
    result_rtol = np.less_equal(result / np.add(deno, minimum), loss)  # relative-error check
    if not result_rtol.all() and not result_atol.all():
        # Fail only when the number of offending elements exceeds the
        # tolerated fraction on BOTH criteria.
        if np.sum(result_rtol == False) > real_result.size * loss and \
                np.sum(result_atol == False) > real_result.size * loss:
            print(real_result[0])
            print(golden[0])
            print("[ERROR] result error")
            return False
    print("test pass")
    return True


if __name__ == '__main__':
    verify_result(sys.argv[1], sys.argv[2])
实现思路
op_host里的tiling部分其实没啥好讲的,大部分算子在这里都一样,可以直接cv官方仓库
tileNum和totalLength分别是分段数和数据总长度,我们在host侧计算,通过结构体传到device侧,device利用这些数据进行分段
#include "register/tilingdata_base.h"
namespace optiling {
// Tiling structure shared between host and device: the host fills it in
// TilingFunc, the kernel unpacks it with GET_TILING_DATA.
BEGIN_TILING_DATA_DEF(SigmoidCustomTilingData)
TILING_DATA_FIELD_DEF(uint32_t, totalLength);  // total number of elements in the input
TILING_DATA_FIELD_DEF(uint32_t, tileNum);      // number of tiles each core splits its slice into
END_TILING_DATA_DEF;
// Bind the tiling struct to the SigmoidCustom operator type.
REGISTER_TILING_DATA_CLASS(SigmoidCustom, SigmoidCustomTilingData)
}
这里需要实现的只有计算tiling大小的函数,没有特别的地方,照抄即可
后面两个部分分别是输出shape的推导,和算子输入输出格式的声明,已经写好了。
#include "sigmoid_custom_tiling.h"
#include "register/op_def_registry.h"
namespace optiling {
const uint32_t BLOCK_DIM = 8;  // number of AI cores the work is spread over
const uint32_t TILE_NUM = 8;   // tiles per core (each further halved by double buffering)

// Host-side tiling: compute totalLength from the input shape, pick the block
// dim, and serialize {totalLength, tileNum} into the raw tiling buffer that
// the device kernel unpacks with GET_TILING_DATA.
static ge::graphStatus TilingFunc(gert::TilingContext* context)
{
    // Guard against a malformed context before dereferencing (the original
    // sample dereferenced GetInputShape(0) unchecked).
    const gert::StorageShape* inputShape = context->GetInputShape(0);
    if (inputShape == nullptr || context->GetRawTilingData() == nullptr) {
        return ge::GRAPH_FAILED;
    }
    SigmoidCustomTilingData tiling;
    uint32_t totalLength = inputShape->GetOriginShape().GetShapeSize();
    context->SetBlockDim(BLOCK_DIM);
    tiling.set_totalLength(totalLength);
    tiling.set_tileNum(TILE_NUM);
    // Serialize the tiling struct into the context-owned buffer.
    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
    // This operator needs no extra workspace memory.
    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
    currentWorkspace[0] = 0;
    return ge::GRAPH_SUCCESS;
}
}
namespace ge {
// Shape inference: sigmoid is element-wise, so y has exactly x's shape.
static ge::graphStatus InferShape(gert::InferShapeContext* context)
{
    const gert::Shape* inShape = context->GetInputShape(0);
    gert::Shape* outShape = context->GetOutputShape(0);
    *outShape = *inShape;
    return GRAPH_SUCCESS;
}

// Dtype inference: output y inherits the data type of input x.
static ge::graphStatus InferDataType(gert::InferDataTypeContext *context)
{
    context->SetOutputDataType(0, context->GetInputDataType(0));
    return ge::GRAPH_SUCCESS;
}
}
namespace ops {
// Operator definition: declares the I/O signature, wires up the infer
// functions and tiling function, and registers the supported SoC targets.
class SigmoidCustom : public OpDef {
public:
    explicit SigmoidCustom(const char* name) : OpDef(name)
    {
        // Single required input: fp16, ND layout.
        this->Input("x")
            .ParamType(REQUIRED)
            .DataType({ge::DT_FLOAT16})
            .Format({ge::FORMAT_ND})
            .UnknownShapeFormat({ge::FORMAT_ND});
        // Single required output: same dtype/format as the input.
        this->Output("y")
            .ParamType(REQUIRED)
            .DataType({ge::DT_FLOAT16})
            .Format({ge::FORMAT_ND})
            .UnknownShapeFormat({ge::FORMAT_ND});
        this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDataType);
        this->AICore()
            .SetTiling(optiling::TilingFunc);
        // Build configurations for the supported SoC versions.
        this->AICore().AddConfig("ascend910b")
            .AddConfig("ascend310b");
    }
};
// Register the operator with the framework.
OP_ADD(SigmoidCustom);
}
接下来是op_kernel,也就是device侧运行的代码,比较关键
首先看结构:定义一个类,一个函数,函数的工作只有,解绑结构体,得到传递的tiling参数,init方法初始化计算类的对象,把参数传给对象,调用process方法来计算。
类里init函数根据tiling参数,取到当前这个ai_core的数据,并申请所需的队列,缓冲区内存。这里是一份代码,运行在多个核心上,想要每个核心处理不同部分的数据,需要根据当前核心的编号,计算当前核心对应的数据范围,也就是这里的
xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x + this->blockLength * GetBlockIdx(),
this->blockLength);
yGm.SetGlobalBuffer((__gm__ DTYPE_Y *)y + this->blockLength * GetBlockIdx(),
this->blockLength);
大体思路就是开头地址+每个核心的数据长度*当前核心编号
process函数根据tiling参数,分多次进行计算,每次计算调用copyin,compute,copyout,分别是把gm全局内存里的数据拷贝进lm本地内存,用本地内存的数据计算答案,把本地内存的答案拷贝回全局内存。
copyin的具体实现,是把输入数据放进一个队列里,这里使用队列是为了让copyin,compute,copyout并行执行,但他们之间又有数据依赖,需要用队列的方式来同步。
compute就取出输入队列里的元素,根据sigmoid的定义计算,但是直接用AscendC提供的库来算倒数的话,有精度问题,这里采用牛顿迭代法。最后把结果放入输出队列。另外这里计算过程中最好每次操作的中间变量单开一个缓冲区存,有的接口不支持输入输出都用同一个向量
算子的具体用法可以看官方文档
实际上用到的主要只有两类:两个向量作为输入,比如Add,把两个向量相加,只要指定两个输入向量,一个输出向量,和长度即可;另外就是一个向量一个标量作为输入,比如Adds,也就是给一个向量每个位置加上标量,需要指定一个输入向量,一个输入标量,一个输出向量,长度
copyout从输出队列取元素,复制回gm
#include "kernel_operator.h"
using namespace AscendC;
constexpr int32_t BUFFER_NUM = 2;  // double buffering: overlap copy-in/compute/copy-out
// Device-side sigmoid kernel: y = 1 / (1 + e^(-x)), computed tile by tile
// through a CopyIn -> Compute -> CopyOut queue pipeline.
class KernelSigmoid {
public:
    __aicore__ inline KernelSigmoid() {}
    // Bind this core's slice of global memory and allocate queues/buffers.
    // totalLength/tileNum come from the host-side tiling struct.
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, uint32_t totalLength, uint32_t tileNum)
    {
        ASSERT(GetBlockNum() != 0 && "block dim can not be zero!");
        // Elements handled by this core (assumes totalLength divides evenly
        // by the block count — TODO confirm for general shapes).
        this->blockLength = totalLength / GetBlockNum();
        this->tileNum = tileNum;
        ASSERT(tileNum != 0 && "tile num can not be zero!");
        // Each tile is further halved because of double buffering.
        this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
        // Offset into GM: slice start = base + per-core length * core index.
        xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x + this->blockLength * GetBlockIdx(),
                            this->blockLength);
        yGm.SetGlobalBuffer((__gm__ DTYPE_Y *)y + this->blockLength * GetBlockIdx(),
                            this->blockLength);
        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(half));
        pipe.InitBuffer(outQueueY, BUFFER_NUM, this->tileLength * sizeof(half));
        // Scratch buffers: each intermediate gets its own buffer because some
        // vector APIs do not allow input and output to alias.
        pipe.InitBuffer(tmpBuffer1, this->tileLength * sizeof(half));
        pipe.InitBuffer(tmpBuffer2, this->tileLength * sizeof(half));
        pipe.InitBuffer(tmpBuffer3, this->tileLength * sizeof(half));
        pipe.InitBuffer(tmpBuffer4, this->tileLength * sizeof(half));
    }
    // Iterate over all tiles of this core's slice; the queues let the three
    // stages of consecutive iterations overlap.
    __aicore__ inline void Process()
    {
        int32_t loopCount = this->blockLength / this->tileLength;
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i);
            CopyOut(i);
        }
    }
private:
    // 1/src with extra precision: hardware Reciprocal as the seed, refined by
    // Newton iterations (the raw instruction alone fails the 1e-3 tolerance).
    __aicore__ inline void HighPrecisionReciprocal(LocalTensor<half>& dst,
                                                   const LocalTensor<half>& src,
                                                   int32_t length,
                                                   int iterations = 2)
    {
        LocalTensor<half> tmp = tmpBuffer4.Get<half>();
        half two = 2.0h,negone=-1.0h;
        // Initial approximation from the hardware Reciprocal instruction.
        AscendC::Reciprocal(dst, src, length);
        // Newton's method: x_{n+1} = x_n * (2 - a * x_n)
        for (int i = 0; i < iterations; ++i) {
            AscendC::Mul(tmp, src, dst, length);      // tmp = a * x_n
            AscendC::Muls(tmp, tmp, negone, length);  // tmp = -(a * x_n)
            AscendC::Adds(tmp, tmp, two, length);     // tmp = 2 - a * x_n
            AscendC::Mul(dst, dst, tmp, length);      // x_{n+1} = x_n * tmp
        }
    }
    // Stage 1: copy one tile from global memory into the input queue.
    __aicore__ inline void CopyIn(int32_t progress)
    {
        LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
        inQueueX.EnQue(xLocal);
    }
    // Stage 2: sigmoid(x) = 1 / (1 + e^(-x)) on one tile.
    __aicore__ inline void Compute(int32_t progress)
    {
        LocalTensor<half> xLocal = inQueueX.DeQue<half>();
        LocalTensor<half> yLocal = outQueueY.AllocTensor<half>();
        LocalTensor<half> tmp1 = tmpBuffer1.Get<half>();
        LocalTensor<half> tmp2 = tmpBuffer2.Get<half>();
        LocalTensor<half> tmp3 = tmpBuffer3.Get<half>();
        half one=1.0,negone=-1.0;
        AscendC::Muls(tmp1, xLocal, negone, this->tileLength);  // tmp1 = -x
        AscendC::Exp(tmp2, tmp1, this->tileLength);             // tmp2 = e^(-x)
        AscendC::Adds(tmp3, tmp2, one, this->tileLength);       // tmp3 = 1 + e^(-x)
        // Plain Reciprocal is not accurate enough for the 1e-3 check:
        // AscendC::Reciprocal(yLocal, tmp3, this->tileLength);
        HighPrecisionReciprocal(yLocal,tmp3,this->tileLength,2);
        outQueueY.EnQue<half>(yLocal);
        inQueueX.FreeTensor(xLocal);
    }
    // Stage 3: copy one result tile from the output queue back to GM.
    __aicore__ inline void CopyOut(int32_t progress)
    {
        LocalTensor<half> yLocal = outQueueY.DeQue<half>();
        AscendC::DataCopy(yGm[progress * this->tileLength], yLocal, this->tileLength);
        outQueueY.FreeTensor(yLocal);
    }
private:
    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX;    // CopyIn -> Compute handoff
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueY;  // Compute -> CopyOut handoff
    TBuf<QuePosition::VECCALC> tmpBuffer1, tmpBuffer2, tmpBuffer3, tmpBuffer4;
    GlobalTensor<half> xGm;   // this core's input slice in global memory
    GlobalTensor<half> yGm;   // this core's output slice in global memory
    uint32_t blockLength;     // elements per core
    uint32_t tileNum;         // tiles per core (before double buffering)
    uint32_t tileLength;      // elements per tile
};
// Kernel entry point: unpack the host-produced tiling data, then hand the
// work to a KernelSigmoid instance (Init binds GM slices, Process computes).
extern "C" __global__ __aicore__ void sigmoid_custom(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, GM_ADDR tiling) {
    GET_TILING_DATA(tilingData, tiling);
    KernelSigmoid kernel;
    kernel.Init(x, y, tilingData.totalLength, tilingData.tileNum);
    kernel.Process();
}
鲲鹏昇腾开发者社区是面向全社会开放的“联接全球计算开发者,聚合华为+生态”的社区,内容涵盖鲲鹏、昇腾资源,帮助开发者快速获取所需的知识、经验、软件、工具、算力,支撑开发者易学、好用、成功,成为核心开发者。
更多推荐

所有评论(0)