This post is a follow-up to the previous article, (TVM开发代码学习)给Relay系统添加新算子 - 知乎 (zhihu.com). It walks through the six parts that make up a Relay operator's code, deconstructing TVM's Relay operators from the perspective of adding a new one.
To recap, adding a new Relay operator to TVM involves the following pieces:
- A struct that inherits from AttrsNode and holds the operator's attribute parameters, in include/tvm/relay/attrs/
- The type relation function, in src/relay/op/
- Registration of the operator's attribute information, in src/relay/op/
- The Python-side compute definition, in python/tvm/topi/
- The Python-side compute + schedule packaging function, in python/tvm/relay/op/strategy/
- Creation and registration of a CallNode instance for the operator, in src/relay/op/
- The concise Python API that exposes the final functionality, in python/tvm/relay/op
Note that below I merge steps 3 and 6, since both of them perform registrations (honestly, I don't quite see why the official guide splits them into two steps: they usually live in the same .cc file and do similar jobs; perhaps it just reads better as two steps). Step 6 registers the CallNode side: creating a global function that builds the operator's CallNode instance registers it globally, while step 3 registers the AttrsNode created in step 1 and the type relation created in step 2.
So the code for a Relay operator in TVM breaks down into the following six parts:
1. A struct inheriting from AttrsNode that holds the operator's attribute parameters. include/tvm/relay/attrs/
2. The type relation function. src/relay/op/
3. Registration of the operator's AttrsNode and type relation, plus creation and registration of the CallNode. src/relay/op/
4. The Python-side compute definition. python/tvm/topi/
5. The Python-side compute + schedule packaging function. python/tvm/relay/op/strategy/
6. The concise Python API that exposes the final functionality. python/tvm/relay/op
A rough sketch of the flow: AttrsNode → type relation → C++ registration → TOPI compute → strategy (compute + schedule) → Python API.
To get more familiar with this flow, let's read through the code of the Softmax and Conv1d operators.
The Softmax operator:
1.AttrsNode
/*! \brief Attributes used in softmax operators */
struct SoftmaxAttrs : public tvm::AttrsNode<SoftmaxAttrs> {
  int axis;

  TVM_DECLARE_ATTRS(SoftmaxAttrs, "relay.attrs.SoftmaxAttrs") {
    TVM_ATTR_FIELD(axis).set_default(-1).describe("The axis to sum over when computing softmax.");
  }
};
The source is in include\tvm\relay\attrs\nn.h.
A single axis attribute is all that Softmax needs.
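To see what this struct buys us, here is a minimal Python-side check (a sketch, assuming a working TVM installation): the axis field declared with TVM_ATTR_FIELD surfaces directly on the resulting Call node.

from tvm import relay

x = relay.var("x", shape=(4, 5))
call = relay.nn.softmax(x, axis=1)  # a CallNode carrying SoftmaxAttrs
print(call.attrs.axis)              # 1: the field declared in C++ via TVM_ATTR_FIELD
print(type(call.attrs))             # the registered relay.attrs.SoftmaxAttrs node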
2.Type relation function
Softmax's type relation is as simple as the operator itself. Rather than defining its own relation, it reuses IdentityRel(), sharing this relation with many other operators (relu, fast_softmax, l2_normalize, etc.) whose inputs and outputs have identical types and shapes.
The source is in \src\relay\op\type_relations.cc
bool IdentityRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                 const TypeReporter& reporter) {
  for (size_t i = 1; i < types.size(); ++i) {
    reporter->Assign(types[i], types[0]);
  }
  return true;
}
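IdentityRel simply constrains every later entry in types (for softmax, types[1], the output) to equal types[0], the input. You can watch it fire from Python (a sketch, assuming a working TVM installation):

import tvm
from tvm import relay

x = relay.var("x", shape=(2, 3), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.softmax(x, axis=1)))
mod = relay.transform.InferType()(mod)
print(mod["main"].ret_type)  # Tensor[(2, 3), float32], identical to the input, as IdentityRel requires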
3.AttrsNode registration, type relation registration, CallNode registration
The AttrsNode is registered with TVM_REGISTER_NODE_TYPE.
The operator's type relation (and other metadata) is registered with RELAY_REGISTER_OP.
The source is in \src\relay\op\nn\nn.cc
TVM_REGISTER_NODE_TYPE(SoftmaxAttrs);

RELAY_REGISTER_OP("nn.softmax")
    .describe(R"code(Softmax layer.

.. math:: \text{softmax}(x)_i = \frac{exp(x_i)}{\sum_j exp(x_j)}

.. note::
    This operator can be optimized away for inference.

- **data**: The input data
)code" TVM_ADD_FILELINE)
    .set_attrs_type<SoftmaxAttrs>()
    .set_num_inputs(1)
    .add_argument("data", "Tensor", "The input tensor.")
    .set_support_level(1)
    .add_type_rel("Identity", IdentityRel);
Then a function that creates the CallNode object is registered with the TVM_REGISTER_GLOBAL macro.
TVM_REGISTER_GLOBAL("relay.op.nn._make.softmax").set_body_typed([](Expr data, int axis) {
auto attrs = make_object<SoftmaxAttrs>();
attrs->axis = axis;
static const Op& op = Op::Get("nn.softmax");
return Call(op, {data}, Attrs(attrs), {});
});
Unlike the previous article, where a separate MakeCumsum function returned the CallNode, softmax is simple enough that the CallNode creation and the macro registration are written together as one inline lambda.
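The string "relay.op.nn._make.softmax" is the name under which this packed function crosses the FFI boundary; the _make.softmax used by the Python API in step 6 resolves to exactly this function. Looking it up by hand (a sketch, assuming a working TVM installation):

import tvm
from tvm import relay

f = tvm.get_global_func("relay.op.nn._make.softmax")
call = f(relay.var("x", shape=(2, 3)), -1)  # the same CallNode that relay.nn.softmax builds
print(call.op.name)  # nn.softmax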
4.Defining the concrete compute in the Python-side TOPI library
The source is in \python\tvm\topi\nn\softmax.py. This is perhaps the hardest step in the whole operator definition, as it involves designing the tensor expression.
def softmax_common(x, axis, use_fast_exp):
    """The common part of softmax and fast_softmax"""
    shape = x.shape
    if axis < 0:
        axis = len(shape) + axis
    if axis >= len(shape):
        raise ValueError("axis parameter should be less than input dim")

    k1 = te.reduce_axis((0, shape[axis]), name="k")
    k2 = te.reduce_axis((0, shape[axis]), name="k")

    def insert_reduce_index(indices, reduce_index):
        return indices[:axis] + (reduce_index,) + indices[axis:]

    def get_non_reduce_indices(indices):
        return tuple([var for (i, var) in enumerate(indices) if i != axis])

    def _compute_max(*indices):
        eval_range = insert_reduce_index(indices, k1)
        return tvm.te.max(x[eval_range], axis=k1)

    def _compute_delta(max_elem, *indices):
        non_reduce_indices = get_non_reduce_indices(indices)
        return x[indices] - max_elem[non_reduce_indices]

    def _compute_exp(max_elem, *indices):
        non_reduce_indices = get_non_reduce_indices(indices)
        return te.exp(x[indices] - max_elem[non_reduce_indices])

    def _compute_expsum(exp, *indices):
        eval_range = insert_reduce_index(indices, k2)
        return te.sum(exp[eval_range], axis=k2)

    def _normalize(exp, expsum, *indices):
        non_reduce_indices = get_non_reduce_indices(indices)
        return exp[indices] / expsum[non_reduce_indices]

    reduced_shape = tuple([dim for (i, dim) in enumerate(shape) if i != axis])
    max_elem = te.compute(reduced_shape, _compute_max, name="T_softmax_maxelem")

    if use_fast_exp:
        delta = te.compute(
            shape, lambda *indices: _compute_delta(max_elem, *indices), name="T_softmax_delta"
        )
        exp = topi.math.fast_exp(delta)
    else:
        exp = te.compute(
            shape, lambda *indices: _compute_exp(max_elem, *indices), name="T_softmax_exp"
        )
    expsum = te.compute(
        reduced_shape, lambda *indices: _compute_expsum(exp, *indices), name="T_softmax_expsum"
    )
    return te.compute(
        shape,
        lambda *indices: _normalize(exp, expsum, *indices),
        name="T_softmax_norm",
        attrs={"axis": axis},
    )
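The staged tensor expressions are easier to follow against a plain NumPy reference; each line below mirrors one of the named te.compute stages (an illustration only, not TVM code):

import numpy as np

def softmax_ref(x, axis=-1):
    m = np.max(x, axis=axis, keepdims=True)  # T_softmax_maxelem: subtract the max for numerical stability
    e = np.exp(x - m)                        # T_softmax_exp (or T_softmax_delta followed by fast_exp)
    s = np.sum(e, axis=axis, keepdims=True)  # T_softmax_expsum
    return e / s                             # T_softmax_norm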
5.Packaging compute and schedule
The source is in \python\tvm\relay\op\strategy\generic.py; the flow is essentially the same as described in the previous article.
def wrap_compute_softmax(topi_compute):
    """Wrap softmax topi compute"""

    def _compute_softmax(attrs, inputs, out_type):
        axis = attrs.get_int("axis")
        return [topi_compute(inputs[0], axis)]

    return _compute_softmax


@override_native_generic_func("softmax_strategy")
def softmax_strategy(attrs, inputs, out_type, target):
    """softmax generic strategy"""
    strategy = _op.OpStrategy()
    strategy.add_implementation(
        wrap_compute_softmax(topi.nn.softmax),
        wrap_topi_schedule(topi.generic.schedule_softmax),
        name="softmax.generic",
    )
    return strategy
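override_native_generic_func makes this the fallback strategy that specific targets can override. As a rough sketch of the override pattern (modeled on TVM's strategy/cuda.py from memory, so treat the exact names such as topi.cuda.schedule_softmax as assumptions):

# sketch of a target-specific override, following the pattern of python/tvm/relay/op/strategy/cuda.py
@softmax_strategy.register(["cuda", "gpu"])
def softmax_strategy_cuda(attrs, inputs, out_type, target):
    strategy = _op.OpStrategy()
    strategy.add_implementation(
        wrap_compute_softmax(topi.nn.softmax),
        wrap_topi_schedule(topi.cuda.schedule_softmax),  # assumed CUDA schedule entry point
        name="softmax.cuda",
    )
    return strategy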
6.The final wrapper
\python\tvm\relay\op\nn\nn.py
def softmax(data, axis=-1):
    r"""Computes softmax.

    .. math:: \text{softmax}(x)_i = \frac{exp(x_i)}{\sum_j exp(x_j)}

    .. note::
        This operator can be optimized away for inference.

    Parameters
    ----------
    data: tvm.relay.Expr
        The input data to the operator.

    axis: int, optional
        The axis to sum over when computing softmax

    Returns
    -------
    result : tvm.relay.Expr
        The computed result.
    """
    return _make.softmax(data, axis)
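Putting the six parts together, here is a minimal end-to-end run (a sketch, assuming a TVM build with the LLVM backend enabled): the Python API builds the CallNode, InferType applies IdentityRel, and relay.build routes through softmax_strategy to the TOPI compute.

import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor

x = relay.var("x", shape=(1, 10), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.softmax(x, axis=-1)))
lib = relay.build(mod, target="llvm")

m = graph_executor.GraphModule(lib["default"](tvm.cpu()))
m.set_input("x", np.random.rand(1, 10).astype("float32"))
m.run()
print(m.get_output(0).numpy().sum())  # ~1.0, since softmax outputs sum to one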
The 1-D convolution operator Conv1d
1.AttrsNode
/*! \brief Attributes used in 1D convolution operators */
struct Conv1DAttrs : public tvm::AttrsNode<Conv1DAttrs> {
  Array<IndexExpr> strides;
  Array<IndexExpr> padding;
  Array<IndexExpr> dilation;
  int groups;
  IndexExpr channels;
  Array<IndexExpr> kernel_size;
  std::string data_layout;
  std::string kernel_layout;
  std::string out_layout;
  DataType out_dtype;

  TVM_DECLARE_ATTRS(Conv1DAttrs, "relay.attrs.Conv1DAttrs") {
    TVM_ATTR_FIELD(strides)
        .set_default(Array<IndexExpr>({1}))
        .describe("Specifies the stride of the convolution.");
    TVM_ATTR_FIELD(padding)
        .set_default(Array<IndexExpr>({0, 0}))
        .describe(
            "If padding is non-zero, then the input is implicitly zero-padded"
            "on both sides for padding number of points");
    TVM_ATTR_FIELD(dilation)
        .set_default(Array<IndexExpr>({1}))
        .describe("Specifies the dilation rate to use for dilated convolution.");
    TVM_ATTR_FIELD(groups).set_default(1).describe(
        "Currently unused but may be added in the future.");
    TVM_ATTR_FIELD(channels)
        .describe(
            "The number of output channels in the convolution."
            " If it is not set, inferred by shape of the weight.")
        .set_default(NullValue<IndexExpr>());
    TVM_ATTR_FIELD(kernel_size)
        .describe("Specifies the dimensions of the convolution window.")
        .set_default(NullValue<Array<IndexExpr>>());
    TVM_ATTR_FIELD(data_layout)
        .set_default("NCW")
        .describe(
            "Dimension ordering of input data. Can be 'NCW', 'NWC', etc."
            "'N', 'C', 'W' stands for batch, channel, and width"
            "dimensions respectively. Convolution is applied on the 'W'"
            "dimension.");
    TVM_ATTR_FIELD(kernel_layout)
        .set_default("OIW")
        .describe(
            "Dimension ordering of weight. Can be 'OIW', or 'WIO', etc."
            "'O', 'I', 'W' stands for num_filter, input_channel, and width"
            "dimensions respectively.");
    // use 0 bits to indicate none.
    TVM_ATTR_FIELD(out_dtype)
        .set_default(NullValue<DataType>())
        .describe("Output data type, set to explicit type under mixed precision setting");
  }
};
2.Type relation function
The source is in \src\relay\op\nn\convolution.h, which contains the type relation functions for all of TVM's convolution operators. The conv1d relation checks the input and output data layouts and, through the reporter, constrains the weight and output types.
// Standard convolution operator shape relations
template <typename AttrType>
bool Conv1DRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
               const TypeReporter& reporter) {
  ICHECK_EQ(types.size(), 3);
  const auto* data = types[0].as<TensorTypeNode>();
  const auto* weight = types[1].as<TensorTypeNode>();
  if (data == nullptr) return false;
  static const Layout kNCW("NCW");
  static const Layout kOIW("OIW");

  const AttrType* param = attrs.as<AttrType>();
  ICHECK(param != nullptr);
  const Layout in_layout(param->data_layout);
  const Layout kernel_layout(param->kernel_layout);

  const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCW);
  ICHECK(trans_in_layout.defined())
      << "Conv only support input layouts that are convertible from NCW."
      << " But got " << in_layout;

  const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIW);
  ICHECK(trans_kernel_layout.defined())
      << "Conv only support kernel layouts that are convertible from OIW."
      << " But got " << kernel_layout;

  Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout);
  const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCW);
  ICHECK(trans_out_layout.defined())
      << "Conv only support output layouts that are convertible from NCW."
      << " But got " << out_layout;

  Array<IndexExpr> dshape_ncw = trans_in_layout.ForwardShape(data->shape);

  IndexExpr channels, dilated_ksize;
  // infer weight if the kernel_size and channels are defined
  if (param->kernel_size.defined() && param->channels.defined()) {
    Array<IndexExpr> wshape;
    wshape = {{param->channels, dshape_ncw[1], param->kernel_size[0]}};
    wshape = trans_kernel_layout.BackwardShape(wshape);
    channels = param->channels;
    dilated_ksize = 1 + (param->kernel_size[0] - 1) * param->dilation[0];
    DataType weight_dtype = data->dtype;
    if (weight != nullptr) {
      weight_dtype = weight->dtype;
    }
    // assign result to reporter
    reporter->Assign(types[1], TensorType(wshape, weight_dtype));
  } else {
    // use weight to infer the conv shape.
    if (weight == nullptr) return false;
    auto wshape = trans_kernel_layout.ForwardShape(weight->shape);
    if (param->kernel_size.defined()) {
      // check the size
      ICHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]))
          << "Conv1D: shape of weight is inconsistent with kernel_size, "
          << " kernel_size=" << param->kernel_size << " wshape=" << wshape;
    }
    if (param->channels.defined()) {
      ICHECK(reporter->AssertEQ(param->channels, wshape[0]))
          << "Conv1D: shape of weight is inconsistent with channels, "
          << " channels=" << param->channels << " wshape=" << wshape;
    }
    if (!dshape_ncw[1].as<tir::AnyNode>() && !wshape[1].as<tir::AnyNode>()) {
      ICHECK(reporter->AssertEQ(dshape_ncw[1], wshape[1]));
    }
    channels = wshape[0];
    dilated_ksize = 1 + (wshape[2] - 1) * param->dilation[0];
  }

  // dilation
  Array<IndexExpr> oshape({dshape_ncw[0], channels, 0});

  if (!dshape_ncw[2].as<tir::AnyNode>()) {
    oshape.Set(2, indexdiv(dshape_ncw[2] + param->padding[0] + param->padding[1] - dilated_ksize,
                           param->strides[0]) +
                      1);
  } else {
    oshape.Set(2, dshape_ncw[2]);
  }

  DataType out_dtype = param->out_dtype;
  if (out_dtype.bits() == 0) {
    out_dtype = data->dtype;
  }
  oshape = trans_out_layout.BackwardShape(oshape);
  // assign output type
  reporter->Assign(types[2], TensorType(oshape, out_dtype));
  return true;
}
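The output-width expression in the middle of the function is the standard convolution arithmetic out_width = (W + pad_left + pad_right - dilated_ksize) / stride + 1. A quick worked example with made-up numbers:

# hypothetical numbers, just to exercise the formula from Conv1DRel
W, pad_left, pad_right = 32, 2, 2
kernel_size, dilation, stride = 5, 1, 2

dilated_ksize = 1 + (kernel_size - 1) * dilation                      # 5
out_width = (W + pad_left + pad_right - dilated_ksize) // stride + 1  # (32 + 4 - 5) // 2 + 1 = 16
# so data (1, 3, 32) with weight (8, 3, 5) yields an output of shape (1, 8, 16)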
3.AttrsNode registration, type relation registration, CallNode registration
The code is in \src\relay\op\nn\convolution.cc
The AttrsNode:
// relay.nn.conv1d
TVM_REGISTER_NODE_TYPE(Conv1DAttrs);
The type relation and additional metadata registration:
RELAY_REGISTER_OP("nn.conv1d")
.describe(R"code(1D convolution layer (e.g. spatial convolution over sequences).
This layer creates a convolution kernel that is convolved
with the layer input to produce a tensor of outputs.
- **data**: This depends on the `layout` parameter. Input is 3D array of shape
(batch_size, in_channels, width) if `layout` is `NCW`.
- **weight**: (channels, in_channels, kernel_size)
- **out**: This depends on the `layout` parameter. Output is 3D array of shape
(batch_size, channels, out_width) if `layout` is `NCW`.
)code" TVM_ADD_FILELINE)
.set_attrs_type<Conv1DAttrs>()
.set_num_inputs(2)
.add_argument("data", "Tensor", "The input tensor.")
.add_argument("weight", "Tensor", "The weight tensor.")
.set_support_level(2)
.add_type_rel("Conv1D", Conv1DRel<Conv1DAttrs>)
.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ConvInferCorrectLayout<Conv1DAttrs>);
The CallNode and its registration:
// Build the CallNode. This helper is shared by the 1D, 2D, and 3D convolution operators.
template <typename T>
inline Expr MakeConv(Expr data, Expr weight, Array<IndexExpr> strides, Array<IndexExpr> padding,
                     Array<IndexExpr> dilation, int groups, IndexExpr channels,
                     Array<IndexExpr> kernel_size, std::string data_layout,
                     std::string kernel_layout, std::string out_layout, DataType out_dtype,
                     std::string op_name) {
  auto attrs = make_object<T>();
  attrs->strides = std::move(strides);
  attrs->padding = std::move(padding);
  attrs->dilation = std::move(dilation);
  attrs->groups = groups;
  attrs->channels = std::move(channels);
  attrs->kernel_size = std::move(kernel_size);
  attrs->data_layout = std::move(data_layout);
  attrs->kernel_layout = std::move(kernel_layout);
  attrs->out_layout = std::move(out_layout);
  attrs->out_dtype = std::move(out_dtype);
  const Op& op = Op::Get(op_name);
  return Call(op, {data, weight}, Attrs(attrs), {});
}

// Register it
TVM_REGISTER_GLOBAL("relay.op.nn._make.conv1d")
    .set_body_typed([](Expr data, Expr weight, Array<IndexExpr> strides, Array<IndexExpr> padding,
                       Array<IndexExpr> dilation, int groups, IndexExpr channels,
                       Array<IndexExpr> kernel_size, String data_layout, String kernel_layout,
                       String out_layout, DataType out_dtype) {
      return MakeConv<Conv1DAttrs>(data, weight, strides, padding, dilation, groups, channels,
                                   kernel_size, data_layout, kernel_layout, out_layout, out_dtype,
                                   "nn.conv1d");
    });
4.Defining the compute in the TOPI library
The source is in \python\tvm\topi\nn\conv1d.py, which implements convolution for both the NWC and NCW data layouts; only the NCW version is shown here.
Defining operators with tensor expressions in TOPI is arguably the hardest and most abstract part of TVM's operator code; Tianqi Chen has described it as the bridge between the high-level and low-level IR. There doesn't seem to be a simple, clear, and complete tutorial on TVM's tensor expressions online or in the TVM community, so I plan to write one in the future.
def conv1d_ncw(data, kernel, strides=1, padding="VALID", dilation=1, out_dtype=None):
    if out_dtype is None:
        out_dtype = data.dtype
    if isinstance(strides, (tuple, list)):
        strides = strides[0]
    if isinstance(dilation, (tuple, list)):
        dilation = dilation[0]

    batch, in_channels, data_width = data.shape
    out_channels, _, kernel_size = kernel.shape

    # Compute the output shape
    dilated_kernel_size = (kernel_size - 1) * dilation + 1
    pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size,))
    out_channels = simplify(out_channels)
    out_width = simplify((data_width - dilated_kernel_size + pad_left + pad_right) // strides + 1)

    # Apply padding
    pad_before = [0, 0, pad_left]
    pad_after = [0, 0, pad_right]
    temp = pad(data, pad_before, pad_after, name="pad_temp")

    # Compute graph
    rc = te.reduce_axis((0, in_channels), name="rc")
    rw = te.reduce_axis((0, kernel_size), name="rw")

    return te.compute(
        (batch, out_channels, out_width),
        lambda b, c, w: te.sum(
            temp[b, rc, w * strides + rw * dilation].astype(out_dtype)
            * kernel[c, rc, rw].astype(out_dtype),
            axis=[rc, rw],
        ),
        tag="conv1d_ncw",
    )
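As with softmax, a plain NumPy reference clarifies the te.compute: the reduce axes rc and rw become the two innermost loops (an illustration only, not TVM code):

import numpy as np

def conv1d_ncw_ref(data, kernel, stride=1, dilation=1, pad=(0, 0)):
    b, c_in, _ = data.shape
    c_out, _, k = kernel.shape
    x = np.pad(data, ((0, 0), (0, 0), pad))  # mirrors pad_temp
    w_out = (x.shape[2] - ((k - 1) * dilation + 1)) // stride + 1
    out = np.zeros((b, c_out, w_out), dtype=data.dtype)
    for n in range(b):
        for c in range(c_out):
            for w in range(w_out):
                for rc in range(c_in):   # reduction over input channels
                    for rw in range(k):  # reduction over kernel width
                        out[n, c, w] += x[n, rc, w * stride + rw * dilation] * kernel[c, rc, rw]
    return out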
5.Wrapping compute + schedule
As in the previous article, the CPU implementation is in \python\tvm\relay\op\strategy\generic.py
I haven't yet compared the CPU and GPU implementations; once I'm more familiar with CUDA programming and hardware acceleration, I may write a separate post about it.
# conv1d
def wrap_compute_conv1d(topi_compute):
    """wrap conv1d topi compute"""

    def _compute_conv1d(attrs, inputs, out_type):
        """Compute definition of conv1d"""
        strides = get_const_tuple(attrs.strides)
        padding = get_const_tuple(attrs.padding)
        dilation = get_const_tuple(attrs.dilation)
        out_dtype = attrs.out_dtype
        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
        return [topi_compute(inputs[0], inputs[1], strides, padding, dilation, out_dtype)]

    return _compute_conv1d


@override_native_generic_func("conv1d_strategy")
def conv1d_strategy(attrs, inputs, out_type, target):
    """conv1d generic strategy"""
    logger.warning("conv1d is not optimized for this platform.")
    layout = attrs.data_layout
    dilation = get_const_tuple(attrs.dilation)
    if dilation[0] < 1:
        raise ValueError("dilation should be a positive value")
    strategy = _op.OpStrategy()
    if layout == "NCW":
        strategy.add_implementation(
            wrap_compute_conv1d(topi.nn.conv1d_ncw),
            wrap_topi_schedule(topi.generic.schedule_conv1d_ncw),
            name="conv1d_ncw.generic",
        )
    elif layout == "NWC":
        strategy.add_implementation(
            wrap_compute_conv1d(topi.nn.conv1d_nwc),
            wrap_topi_schedule(topi.generic.schedule_conv1d_nwc),
            name="conv1d_nwc.generic",
        )
    else:
        raise ValueError("Unsupported conv1d layout {}".format(layout))
    return strategy
6.The final API wrapper (again in \python\tvm\relay\op\nn\nn.py)
def conv1d(
    data,
    weight,
    strides=1,
    padding=0,
    dilation=1,
    groups=1,
    channels=None,
    kernel_size=None,
    data_layout="NCW",
    kernel_layout="OIW",
    out_layout="",
    out_dtype="",
):
    if isinstance(kernel_size, int):
        kernel_size = (kernel_size,)
    if isinstance(strides, int):
        strides = (strides,)
    if isinstance(dilation, int):
        dilation = (dilation,)
    padding = get_pad_tuple1d(padding)
    return _make.conv1d(
        data,
        weight,
        strides,
        padding,
        dilation,
        groups,
        channels,
        kernel_size,
        data_layout,
        kernel_layout,
        out_layout,
        out_dtype,
    )
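Finally, a sketch that exercises the whole conv1d chain (assuming a working TVM installation): the scalar arguments are normalized to tuples, _make.conv1d builds the CallNode, and InferType runs Conv1DRel, reproducing the shape arithmetic worked out above.

import tvm
from tvm import relay

x = relay.var("x", shape=(1, 3, 32), dtype="float32")
w = relay.var("w", shape=(8, 3, 5), dtype="float32")
y = relay.nn.conv1d(x, w, strides=2, padding=2, channels=8, kernel_size=5)
mod = relay.transform.InferType()(tvm.IRModule.from_expr(relay.Function([x, w], y)))
print(mod["main"].ret_type)  # Tensor[(1, 8, 16), float32], matching Conv1DRel's arithmetic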