深入探究Python：字符串子串

2024-03-24 09:14:10

a='dddddddddd'
a=a[1:6]

              0 LOAD_CONST               0 ('dddddddddd')
              2 STORE_NAME               0 (a)

              4 LOAD_NAME                0 (a)
              6 LOAD_CONST               1 (1)
              8 LOAD_CONST               2 (6)
             10 BUILD_SLICE              2
             12 BINARY_SUBSCR
             14 STORE_NAME               0 (a)
             16 LOAD_CONST               3 (None)
             18 RETURN_VALUE

【BUILD_SLICE】    
            /* BUILD_SLICE opcode body: take the slice bounds from the value
             * stack, build a slice object from them and leave it on the stack.
             * In the disassembly above, a[1:6] is compiled with oparg 2
             * (start and stop only). */
            PyObject *start, *stop, *step, *slice;
            /* A step operand is only taken from the stack when oparg is 3;
             * otherwise NULL is handed to PySlice_New as the step. */
            if (oparg == 3)
                step = POP();
            else
                step = NULL;
            stop = POP();
            /* start is read with TOP() rather than POP(); its stack slot is
             * overwritten by SET_TOP(slice) below. */
            start = TOP();
            slice = PySlice_New(start, stop, step);
            /* Drop the references taken from the stack; step may be NULL,
             * hence Py_XDECREF. */
            Py_DECREF(start);
            Py_DECREF(stop);
            Py_XDECREF(step);
            /* The slot is written before the NULL check, so on failure it
             * holds NULL when control jumps to the error label. */
            SET_TOP(slice);
            if (slice == NULL)
                goto error;
            DISPATCH();

 【BINARY_SUBSCR】
            /* BINARY_SUBSCR opcode body: evaluate container[sub] through
             * PyObject_GetItem and leave the result on the stack. */
            PyObject *sub = POP();
            /* The container is read with TOP(); its slot is overwritten by
             * SET_TOP(res) below. */
            PyObject *container = TOP();
            PyObject *res = PyObject_GetItem(container, sub);
            /* Both operands are released once the lookup has returned. */
            Py_DECREF(container);
            Py_DECREF(sub);
            /* The slot is written before the NULL check, so on failure it
             * holds NULL when control jumps to the error label. */
            SET_TOP(res);
            if (res == NULL)
                goto error;
            DISPATCH();

// Abridged excerpt: the ''' lines below are elision placeholders left by the
// article author, not C code.
//
// Generic implementation of o[key]. The type's mapping slot (mp_subscript)
// is tried first, then the sequence slot (sq_item), then the PyType_Check
// branch; if none applies, a "not subscriptable" TypeError is produced via
// type_error().
PyObject *
PyObject_GetItem(PyObject *o, PyObject *key)
{
    PyMappingMethods *m;
    PySequenceMethods *ms;
    
    // Strings take this path
    // mapping.mp_subscript
    m = Py_TYPE(o)->tp_as_mapping;
    if (m && m->mp_subscript) {
        PyObject *item = m->mp_subscript(o, key);// for str this calls unicode_subscript
        // Exactly one of "an item was returned" / "an error is set" must hold
        assert((item != NULL) ^ (PyErr_Occurred() != NULL));
        return item;
    }
    
    // sequence.sq_item
    ms = Py_TYPE(o)->tp_as_sequence;
    if (ms && ms->sq_item) {
        // (branch body elided in this excerpt)
        '''
    }

    if (PyType_Check(o)) {
        // (branch body elided in this excerpt)
        '''
    }

    // None of the above applied: the object cannot be subscripted
    return type_error("'%.200s' object is not subscriptable", o);
}

// Abridged excerpt.
//
// Subscript handler for str objects (called through mp_subscript from
// PyObject_GetItem). `item` may be an index-like object or a slice object;
// anything else sets TypeError and returns NULL.
//
// Slice handling, in order:
//   - computed slice length <= 0           -> the empty string
//   - the slice covers the whole string    -> unicode_result_unchanged(self)
//   - step == 1                            -> PyUnicode_Substring()
//   - any other step                       -> character-by-character copy
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    
    // Single index
    if (PyIndex_Check(item)) {
        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
        // -1 alone is ambiguous, so the error indicator is checked as well
        if (i == -1 && PyErr_Occurred())
            return NULL;
        // A negative index is offset by the string length
        if (i < 0)
            i += PyUnicode_GET_LENGTH(self);
        return unicode_getitem(self, i);
    // Slice object
    }else if (PySlice_Check(item)) {
        Py_ssize_t start, stop, step, slicelength, i;
        size_t cur;
        PyObject *result;
        void *src_data, *dest_data;
        int src_kind, dest_kind;
        Py_UCS4 ch, max_char, kind_limit;
        
        // Extract start/stop/step from the slice object
        if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
            return NULL;
        }
        // Adjust start/stop against the string length; the return value is
        // used as the number of characters the slice selects
        slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
                                            &start, &stop, step);
        
        // Empty result
        if (slicelength <= 0) {
            _Py_RETURN_UNICODE_EMPTY();
        // Slice selects the whole string: nothing to do
        } else if (start == 0 && step == 1 &&
                   slicelength == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        // Contiguous slice (step == 1)
        } else if (step == 1) {
            return PyUnicode_Substring(self,
                                       start, start + slicelength);
        }
        // General case: step != 1
        src_kind = PyUnicode_KIND(self);
        src_data = PyUnicode_DATA(self);
        if (!PyUnicode_IS_ASCII(self)) {
            // First pass: find the largest code point among the selected
            // characters so the result can be allocated for that maximum.
            // NOTE(review): kind_limit is presumably the threshold at which
            // the source kind is already required - confirm against
            // kind_maxchar_limit(); the scan stops once it is reached.
            kind_limit = kind_maxchar_limit(src_kind);
            max_char = 0;
            // NOTE(review): cur is size_t while step is signed; a negative
            // step appears to rely on unsigned wrap-around - confirm.
            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
                ch = PyUnicode_READ(src_kind, src_data, cur);
                if (ch > max_char) {
                    max_char = ch;
                    if (max_char >= kind_limit)
                        break;
                }
            }
        }
        else
            // ASCII source: every character is at most 127
            max_char = 127;
        result = PyUnicode_New(slicelength, max_char);
        if (result == NULL)
            return NULL;
        dest_kind = PyUnicode_KIND(result);
        dest_data = PyUnicode_DATA(result);

        // Second pass: copy every step-th character into the result
        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
            // This inner ch shadows the ch declared at the top of the branch
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
        }
        assert(_PyUnicode_CheckConsistency(result, 1));
        return result;
    // Any other key type
    } else {
        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
        return NULL;
    }
}

// Single-index case.
// Abridged excerpt: the ''' line below is an elision placeholder left by the
// article author, not C code.
//
// Reads the code point at position `index` of `self` and returns the result
// of unicode_char() for it.
// NOTE(review): no range check on `index` is visible here; it is presumably
// in the elided part - confirm against the full source.
static PyObject *
unicode_getitem(PyObject *self, Py_ssize_t index)
{
    void *data;
    enum PyUnicode_Kind kind;
    Py_UCS4 ch;
    
    // (code elided in this excerpt)
    '''
    
    kind = PyUnicode_KIND(self);// storage kind of the string
    data = PyUnicode_DATA(self);// character buffer
    ch = PyUnicode_READ(kind, data, index);
    return unicode_char(ch);
}

/*
 * Read the element at position `index` from the character buffer `data`
 * and yield it as a Py_UCS4.
 *
 * `kind` selects the element type the buffer is viewed as: Py_UCS1 for
 * PyUnicode_1BYTE_KIND, Py_UCS2 for PyUnicode_2BYTE_KIND, and Py_UCS4 for
 * any other value.
 *
 * `kind` may be evaluated twice, so arguments should be free of side
 * effects. The per-branch notes are block comments placed BEFORE the line
 * continuation: a backslash only splices lines when it is the last
 * character on the line, so nothing may follow it.
 */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : /* UCS1 */ \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : /* UCS2 */ \
            ((const Py_UCS4 *)(data))[(index)]   /* UCS4 */ \
        ) \
    ))

// Contiguous (step == 1) case.
// Abridged excerpt.
//
// Returns the characters of `self` in the half-open range [start, end).
// `end` is clamped to the string length first. A range covering the whole
// string goes through unicode_result_unchanged(); a negative bound sets
// IndexError and returns NULL; an empty or inverted range yields the empty
// string. Otherwise the selected characters are handed to
// _PyUnicode_FromASCII() (ASCII strings) or PyUnicode_FromKindAndData().
PyObject*
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
{
    Py_ssize_t total = PyUnicode_GET_LENGTH(self);
    Py_ssize_t count;
    unsigned char *base;
    int width;

    end = Py_MIN(end, total);

    // Whole string requested: no copy needed
    if (start == 0 && end == total)
        return unicode_result_unchanged(self);

    // Negative bounds are rejected
    if (start < 0 || end < 0) {
        PyErr_SetString(PyExc_IndexError, "string index out of range");
        return NULL;
    }

    // Nothing selected
    if (start >= total || end < start)
        _Py_RETURN_UNICODE_EMPTY();

    count = end - start;
    base = PyUnicode_1BYTE_DATA(self);

    // ASCII strings: one byte per character, so the offset is just `start`
    if (PyUnicode_IS_ASCII(self))
        return _PyUnicode_FromASCII((char*)(base + start), count);

    // Other strings: the byte offset is scaled by the kind value
    width = PyUnicode_KIND(self);
    return PyUnicode_FromKindAndData(width, base + width * start, count);
}

// ASCII case.
//
// Builds a string object from `size` bytes starting at `buffer`. A single
// byte is delegated to get_latin1_char(); any other size allocates a new
// object with maximum character 127 and copies the bytes into it. Returns
// NULL when the allocation fails.
PyObject*
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
{
    const unsigned char *bytes = (const unsigned char *)buffer;

    if (size != 1) {
        // Allocate a fresh object and copy the bytes across
        PyObject *str = PyUnicode_New(size, 127);
        if (str == NULL)
            return NULL;
        memcpy(PyUnicode_1BYTE_DATA(str), bytes, size);
        assert(_PyUnicode_CheckConsistency(str, 1));
        return str;
    }

    // Exactly one character
    return get_latin1_char(bytes[0]);
}

// Non-ASCII case.
//
// Dispatches on `kind` to the matching constructor (_PyUnicode_FromUCS1,
// _PyUnicode_FromUCS2 or _PyUnicode_FromUCS4), passing `buffer` and `size`
// through unchanged. An unrecognised kind sets SystemError and returns NULL.
PyObject*
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
{
    if (kind == PyUnicode_1BYTE_KIND)
        return _PyUnicode_FromUCS1(buffer, size);

    if (kind == PyUnicode_2BYTE_KIND)
        return _PyUnicode_FromUCS2(buffer, size);

    if (kind == PyUnicode_4BYTE_KIND)
        return _PyUnicode_FromUCS4(buffer, size);

    PyErr_SetString(PyExc_SystemError, "invalid kind");
    return NULL;
}

// UCS1
//
// Builds a string object from `size` Py_UCS1 elements starting at `u`.
// Size 0 yields the empty string and size 1 is delegated to
// get_latin1_char(). Otherwise a new object is allocated for the largest
// element found in the input and the elements are copied into it.
// Returns NULL when the allocation fails.
static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
{
    PyObject *res;
    unsigned char max_char;

    // Empty string
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();
    assert(size > 0);
    // Single character
    if (size == 1)
        return get_latin1_char(u[0]);
    // Largest element in the input; passed to PyUnicode_New as the maximum
    max_char = ucs1lib_find_max_char(u, u + size);
    res = PyUnicode_New(size, max_char);
    // PyUnicode_New can fail; never hand a NULL result to memcpy
    if (res == NULL)
        return NULL;
    // Copy into the new object
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);

    return res;
}

// UCS2
//
// Builds a string object from `size` Py_UCS2 elements starting at `u`.
// Size 0 yields the empty string and size 1 is delegated to unicode_char().
// Otherwise a new object is allocated for the largest element found in the
// input. If that maximum is at least 256 the elements are copied as-is;
// if it is below 256 they are converted to Py_UCS1 while being copied.
// Returns NULL when the allocation fails.
static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
{
    PyObject *res;
    Py_UCS2 max_char;

    // Empty string
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();
    assert(size > 0);

    // Single character
    if (size == 1)
        return unicode_char(u[0]);

    // Largest element in the input; passed to PyUnicode_New as the maximum
    max_char = ucs2lib_find_max_char(u, u + size);
    res = PyUnicode_New(size, max_char);
    // PyUnicode_New can fail; never write through a NULL result
    if (res == NULL)
        return NULL;

    if (max_char >= 256) {
        // General case: copy the 2-byte elements into the new object
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
    }
    else {
        // Maximum is small enough: convert each element to Py_UCS1
        _PyUnicode_CONVERT_BYTES(
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
    }

    return res;
}

// UCS4
//
// Builds a string object from `size` Py_UCS4 elements starting at `u`.
// Size 0 yields the empty string and size 1 is delegated to unicode_char().
// Otherwise a new object is allocated for the largest element found in the
// input, and the elements are stored in the narrowest form that fits:
// converted to Py_UCS1 when the maximum is below 256, converted to Py_UCS2
// when it is below 0x10000, and copied as-is otherwise.
// Returns NULL when the allocation fails.
static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
{
    PyObject *res;
    Py_UCS4 max_char;

    // Empty string
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    // Single character
    if (size == 1)
        return unicode_char(u[0]);

    // Largest element in the input; passed to PyUnicode_New as the maximum
    max_char = ucs4lib_find_max_char(u, u + size);
    res = PyUnicode_New(size, max_char);
    // PyUnicode_New can fail; never write through a NULL result
    if (res == NULL)
        return NULL;

    if (max_char < 256) {
        // Maximum fits in one byte: convert each element to Py_UCS1
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
                                 PyUnicode_1BYTE_DATA(res));
    }
    else if (max_char < 0x10000) {
        // Maximum fits in two bytes: convert each element to Py_UCS2
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
                                 PyUnicode_2BYTE_DATA(res));
    }
    else {
        // General case: copy the 4-byte elements into the new object
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
    }

    return res;
}

码农公寓

相关文章