4.1. 字符串对象
字符串对象是“变长对象”。
4.1.1. Python中的创建
Python中字符串(bytes)对象创建最重要的方法为PyBytes_FromStringAndSize,如下Python语句最终会调用到PyBytes_FromStringAndSize:
a = b'hello'
b = str(b'world')
4.1.2. PyBytes_FromStringAndSize的C调用栈
在由语法分析结果构建AST的阶段(ast.c),处理到字符串字面量(case STRING)时,最终会调到PyBytes_FromStringAndSize,调用顺序如下:
// ast.c
ast_for_expr
=>ast_for_power
=>ast_for_atom_expr
=>ast_for_atom (case STRING)
=>parsestrplus
=>parsestr
// bytesobject.c
=>PyBytes_FromStringAndSize
4.1.3. PyBytes_FromStringAndSize源码
// bytesobject.c
/*
 * Build a bytes object of `size` bytes, copying the data from `str`.
 *
 * - size < 0: sets SystemError and returns NULL.
 * - size == 1 and str != NULL: the object is looked up in (and, on a miss,
 *   stored into) the `characters` cache, indexed by the byte value.
 * - str == NULL: the object produced by _PyBytes_FromSize is returned
 *   without copying any data into it.
 *
 * Returns NULL when _PyBytes_FromSize fails.
 */
PyObject *
PyBytes_FromStringAndSize(const char *str, Py_ssize_t size)
{
    PyBytesObject *result;

    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
            "Negative size passed to PyBytes_FromStringAndSize");
        return NULL;
    }

    /* Cache hit for a one-byte object: hand out another reference. */
    if (size == 1 && str != NULL) {
        result = characters[*str & UCHAR_MAX];
        if (result != NULL) {
#ifdef COUNT_ALLOCS
            one_strings++;
#endif
            Py_INCREF(result);
            return (PyObject *)result;
        }
    }

    result = (PyBytesObject *)_PyBytes_FromSize(size, 0);
    if (result == NULL || str == NULL) {
        /* Either the allocation failed (NULL) or there is nothing to copy. */
        return (PyObject *)result;
    }

    memcpy(result->ob_sval, str, size);

    if (size == 1) {
        /* share short strings: the cache keeps its own reference */
        Py_INCREF(result);
        characters[*str & UCHAR_MAX] = result;
    }
    return (PyObject *)result;
}
// bytesobject.c
/*
 * Allocate a bytes object with room for `size` bytes.
 *
 * size must be >= 0 (asserted). For size == 0 the `nullstring` singleton is
 * returned with an extra reference once it exists; the first empty object
 * created here becomes that singleton.
 *
 * use_calloc selects PyObject_Calloc over PyObject_Malloc; on the malloc
 * path only the terminating '\0' at ob_sval[size] is written explicitly.
 *
 * Returns NULL with OverflowError set when the size is too large, or the
 * result of PyErr_NoMemory() when the allocation fails.
 */
static PyObject *
_PyBytes_FromSize(Py_ssize_t size, int use_calloc)
{
    PyBytesObject *bytes;
    assert(size >= 0);

    if (size == 0) {
        bytes = nullstring;
        if (bytes != NULL) {
#ifdef COUNT_ALLOCS
            null_strings++;
#endif
            Py_INCREF(bytes);
            return (PyObject *)bytes;
        }
    }

    /* Header size plus payload must not exceed PY_SSIZE_T_MAX. */
    if ((size_t)size > (size_t)PY_SSIZE_T_MAX - PyBytesObject_SIZE) {
        PyErr_SetString(PyExc_OverflowError,
                        "byte string is too large");
        return NULL;
    }

    /* Inline PyObject_NewVar */
    bytes = (PyBytesObject *)(use_calloc
                ? PyObject_Calloc(1, PyBytesObject_SIZE + size)
                : PyObject_Malloc(PyBytesObject_SIZE + size));
    if (bytes == NULL) {
        return PyErr_NoMemory();
    }

    (void)PyObject_INIT_VAR(bytes, &PyBytes_Type, size);
    bytes->ob_shash = -1;
    if (!use_calloc) {
        bytes->ob_sval[size] = '\0';
    }

    if (size == 0) {
        /* empty byte string singleton: the cache keeps its own reference */
        nullstring = bytes;
        Py_INCREF(bytes);
    }
    return (PyObject *)bytes;
}
可以看到:
- 字符串对象的C数据结构:
// bytesobject.h
/* In-memory layout of a bytes object. */
typedef struct {
/* Object header macro; _PyBytes_FromSize initializes it through
   PyObject_INIT_VAR(op, &PyBytes_Type, size). */
PyObject_VAR_HEAD
/* Set to -1 when the object is created in _PyBytes_FromSize.
   NOTE(review): presumably a cached hash with -1 meaning "not computed" -
   confirm against bytes_hash. */
Py_hash_t ob_shash;
/* Start of the byte data. Declared with one element, but
   _PyBytes_FromSize allocates PyBytesObject_SIZE + size bytes and writes
   '\0' at ob_sval[size], so index `size` is within the allocation. */
char ob_sval[1];
} PyBytesObject;
- 空串缓存:空串(nullstring)为同一个地址,第二次需要空串时,只是将计数加1,在_PyBytes_FromSize中实现空串缓存;
- 字符缓冲池:长度为1的bytes对象按字节值缓存在characters数组中,相同的单字节对象共用同一个地址,第二次需要该字符时,只是将引用计数加1,在PyBytes_FromStringAndSize中实现字符缓存;
4.2. 字符串对象的特性
支持tp_as_number、tp_as_sequence、tp_as_mapping这三种操作。
4.2.1. 数值操作
// bytesobject.c
&bytes_as_number, /* tp_as_number */
4.2.2. 序列操作
// bytesobject.c
&bytes_as_sequence, /* tp_as_sequence */
// bytesobject.c
/* Sequence-protocol slot table for bytes; the type object refers to it
   through its tp_as_sequence entry. Entries are positional, so their order
   must match the PySequenceMethods declaration. The three slots set to 0
   (sq_slice, sq_ass_item, sq_ass_slice) are not implemented here. */
static PySequenceMethods bytes_as_sequence = {
(lenfunc)bytes_length, /*sq_length*/
(binaryfunc)bytes_concat, /*sq_concat*/
(ssizeargfunc)bytes_repeat, /*sq_repeat*/
(ssizeargfunc)bytes_item, /*sq_item*/
0, /*sq_slice*/
0, /*sq_ass_item*/
0, /*sq_ass_slice*/
(objobjproc)bytes_contains /*sq_contains*/
};
因为PySequenceMethods中的赋值槽(sq_ass_item、sq_ass_slice)均为0,即没有实现设置方法,所以字符串不可变。
其中:
- bytes_length
len(b'hello')
- bytes_concat
b'hello' + b'world'
多个字符串相加效率低于join,join只分配一次内存;
- bytes_repeat
b'hello'*10
效率要高于将同一个字符串多次相加;
- bytes_item:Python层的下标访问(如b'hello'[1])走的是bytes_subscript(见4.2.3),暂时没有找到直接对应bytes_item的Python语句;
- bytes_contains
b'h' in b'hello'
4.2.3. 关联操作
// bytesobject.c
&bytes_as_mapping, /* tp_as_mapping */
// bytesobject.c
/* Mapping-protocol slot table for bytes; the type object refers to it
   through its tp_as_mapping entry. Entries are positional.
   NOTE(review): the slot names below follow CPython's PyMappingMethods
   (mp_length, mp_subscript, mp_ass_subscript) - confirm against the header. */
static PyMappingMethods bytes_as_mapping = {
/* length: same function as the sequence table's sq_length */
(lenfunc)bytes_length,
/* subscript: serves both integer indexing and slicing */
(binaryfunc)bytes_subscript,
/* third slot left as 0: not implemented */
0,
};
其中:
- bytes_subscript
test = b'hello world'
test[1]
test[0:5]
test[1]会走bytes_subscript方法的index分支,test[0:5]会走slice分支;
4.2.4. to string
// bytesobject.c
(reprfunc)bytes_repr, /* tp_repr */
bytes_str, /* tp_str */
4.2.5. hash
// bytesobject.c
(hashfunc)bytes_hash, /* tp_hash */
4.2.6. 比较
// bytesobject.c
(richcmpfunc)bytes_richcompare, /* tp_richcompare */
4.2.7. 内置方法
// bytesobject.c
bytes_methods, /* tp_methods */
4.3. 参考
- Python源码剖析