SQLServer-TEMPDB性能问题排查

SQLServer-TEMPDB性能问题排查

问题描述

实例卡慢-金融云用户-RT敏感

问题分析

现场

首先看下进程状态:

select * from sys.sysprocesses 
where spid>50 
and lastwaittype<>'MISCELLANEOUS' 
and status<>'sleeping' 
and spid<>@@SPID    

SQLServer-TEMPDB性能问题排查
500+的进程挂起,必然会卡慢,但这是结果我们看看能否找到原因;

可以注意到很多session都被block了,所以我们再看下阻塞链,sys.sysprocesses视图的blocked列表示该会话被哪个会话所阻塞:

USE master
go
SET NOCOUNT ON

DECLARE
    @time_lock int,
    @show_type int

SELECT
    @time_lock = 1,
    @show_type = 0    -- 0. block only   1. all process    2. block, if not block, show all process
    

-- ================================================
-- Get Lock spid
-- ================================================
DECLARE
    @level smallint,
    @rows int
SELECT
    @level = 0,
    @rows = 0

DECLARE @tb_block TABLE(
    ID int IDENTITY
        PRIMARY KEY,
    block_id smallint,
    spid smallint,
    blocked smallint,
    waittime bigint,
    level smallint,
    UNIQUE(
        spid, blocked, block_id)
        WITH(
            IGNORE_DUP_KEY = ON)
)
INSERT @tb_block(
    block_id, spid, blocked, waittime, level)
SELECT
    CASE
        WHEN blocked = spid OR blocked = 0 THEN spid
        ELSE 0 END, 
    spid, blocked, MAX(waittime), @level
FROM master.dbo.sysprocesses P WITH(NOLOCK)
WHERE spid > 50
    AND blocked > 0
GROUP BY spid, blocked

SELECT
    @rows = @rows + @@ROWCOUNT

IF @show_type = 1 OR (@show_type = 2 AND @rows = 0)
    INSERT @tb_block(
        block_id, spid, blocked, waittime, level)
    SELECT
        CASE
            WHEN blocked = spid OR blocked = 0 THEN spid
            ELSE 0 END, 
        spid, blocked, MAX(waittime), @level
    FROM master.dbo.sysprocesses P WITH(NOLOCK)
    WHERE spid > 50
    GROUP BY spid, blocked

INSERT @tb_block(
    block_id, spid, blocked, waittime, level)
SELECT
    CASE
        WHEN blocked = spid OR blocked = 0 THEN spid
        ELSE 0 END, 
    spid, blocked, MAX(waittime), @level
FROM master.dbo.sysprocesses P WITH(NOLOCK)
WHERE spid > 50
    AND spid IN(
            SELECT blocked FROM @tb_block)
GROUP BY spid, blocked

SELECT
    @rows = @rows + @@ROWCOUNT

WHILE @rows > 0
BEGIN
    SELECT
        @level = @level + 1

    UPDATE A SET
        level = @level,
        block_id = B.block_id
    FROM @tb_block A, @tb_block B
    WHERE A.blocked = B.spid
        AND A.level = 0
        AND A.block_id = 0
        AND B.level = @level - 1
        AND B.block_id > 0

    SELECT
        @rows = @@ROWCOUNT
END

-- ================================================
-- only keep waittime >= @time_lock
-- ================================================
DELETE A
FROM @tb_block A
WHERE NOT EXISTS(
        SELECT * FROM @tb_block
        WHERE block_id = A.block_id
            AND waittime >= @time_lock * 1000)

-- ================================================
-- Get SQL Script
-- ================================================
IF OBJECT_ID(N'tempdb..#LockSQL') IS NOT NULL
    DROP TABLE #LockSQL

CREATE TABLE #LockSQL(
    EventType nvarchar(30), 
    spid int,
    sql nvarchar(4000),
    id int IDENTITY
        PRIMARY KEY)

DECLARE 
    @spid smallint,
    @sql nvarchar(4000)

DECLARE CUR_Lock CURSOR FORWARD_ONLY READ_ONLY LOCAL
FOR
SELECT DISTINCT 
    spid
FROM @tb_block
OPEN CUR_Lock
FETCH CUR_Lock INTO @spid
WHILE @@FETCH_STATUS = 0
BEGIN
    SELECT
        @sql = N'
INSERT #LockSQL(
    EventType, spid, sql) 
EXEC(N''DBCC INPUTBUFFER(' + RTRIM(@spid) + N')
WITH NO_INFOMSGS'')
'
    EXEC sp_executesql @sql

    IF @@ROWCOUNT > 0
    BEGIN
        UPDATE A SET 
            spid = @spid
        FROM #LockSQL A
        WHERE IDENTITYCOL = @@IDENTITY
    END

    FETCH CUR_Lock INTO @spid
END
CLOSE CUR_Lock
DEALLOCATE CUR_Lock

-- ================================================
-- Result
-- ================================================
;WITH
BLOCK AS(
    -- Block spid
    SELECT 
        L.ID,
        P.sql_handle,

        L.block_id,
        L.spid , 
        L.blocked, 

        P.waittype,
        waittime = CONVERT(bigint, P.waittime), 
        lastwaittype = CONVERT(nchar(32), LEFT(P.lastwaittype, LEN(P.lastwaittype) - 1)),
        waitresource = RTRIM(CONVERT(nvarchar(256), P.waitresource)),
        DbName = DB.name,
        hostname = RTRIM(CONVERT(nvarchar(256), P.hostname)),
        program_name = RTRIM(CONVERT(nvarchar(256), P.program_name)),
        hostprocess = CONVERT(nchar(10), P.hostprocess),
        loginame = CONVERT(nvarchar(256), P.loginame),  
        P.login_time, 
        nt_domain = RTRIM(CONVERT(nvarchar(256), P.nt_domain)),
        P.net_address,
        P.open_tran,  
        P.status, 
        cputime = P.cpu
    FROM @tb_block L
        INNER JOIN master.dbo.sysprocesses P WITH(NOLOCK)
            ON L.spid = P.spid
        INNER JOIN sys.databases DB WITH(NOLOCK)
            ON P.dbid = DB.database_id
),
BLSQL1 AS(
    -- block sql - 1
    SELECT
        spid,
        sql
    FROM #LockSQL
),
BLSQL2_PL AS(
    SELECT DISTINCT
        P.spid,
        P.sql_handle, P.stmt_start, P.stmt_end
    FROM master.dbo.sysprocesses P WITH(NOLOCK), @tb_block L
    WHERE P.spid = L.spid
        AND P.sql_handle > 0x
),
BLSQL2 AS(
    SELECT
        PL.spid,
        PL.sql_handle,
        sql_text = PT.text,
        sql_current =  SUBSTRING(
                    PT.text,
                    (PL.stmt_start / 2) + 1,
                    CASE PL.stmt_end
                        WHEN - 1 THEN LEN(PT.text)
                        ELSE (PL.stmt_end - PL.stmt_start) / 2 + 1
                    END)
    FROM BLSQL2_PL PL
        CROSS APPLY sys.dm_exec_sql_text(sql_handle) PT
),
BLPATH AS(
    SELECT 
        ID, block_id, spid, blocked, level,
        path = CONVERT(varchar(4000), RIGHT(1000 + block_id, 3))
    FROM @tb_block A
    WHERE NOT EXISTS(
            SELECT * FROM @tb_block
            WHERE block_id = A.block_id
                AND A.blocked = spid)
    UNION ALL
    SELECT
        A.ID, A.block_id, A.spid, A.blocked, A.level,
        path = CONVERT(varchar(4000), B.path + RIGHT(1000 + A.spid, 3))
    FROM @tb_block A, BLPATH B
    WHERE A.block_id = B.block_id
        AND A.blocked = B.spid
        AND B.blocked <> B.spid
)
SELECT
    Path = SPACE(BLPATH.level * 2) + N'|-' + RTRIM(level) + N'-',
    BLOCK.block_id,
    BLOCK.spid , 
    BLOCK.blocked, 

    BLOCK.waittype,
    waittimeFMT = RTRIM(BLOCK.waittime / 3600000)
        + ':'+ RIGHT(100 + (BLOCK.waittime / 60000) % 60, 2) 
        + ':'+ RIGHT(100 + (BLOCK.waittime / 1000) % 60, 2) 
        + '.'+ RIGHT(1000 + BLOCK.waittime % 1000, 3), 
    BLOCK.waittime, 
    BLOCK.lastwaittype,
    BLOCK.waitresource,
    BLOCK.DbName,
    BLOCK.hostname,
    BLOCK.program_name,
    BLOCK.hostprocess,
    BLOCK.loginame,
    BLOCK.login_time, 
    BLOCK.nt_domain,
    BLOCK.net_address,
    BLOCK.open_tran,  
    BLOCK.status, 
    BLOCK.cputime,

    BLSQL1.sql,

    BLSQL2.sql_text,
    BLSQL2.sql_current
FROM BLOCK
    INNER JOIN BLPATH
        ON BLOCK.ID = BLPATH.ID
    LEFT JOIN BLSQL1
        ON BLOCK.spid = BLSQL1.spid
    LEFT JOIN BLSQL2
        ON BLOCK.spid = BLSQL2.spid
            AND BLOCK.sql_handle = BLSQL2.sql_handle        
ORDER BY BLPATH.path

SQLServer-TEMPDB性能问题排查
很长的阻塞链(600+),逐步看最多达到4层阻塞

几乎每个session都在等PAGELATCH_UP(lastwaittype)对应资源是 2:*:2(waitresource)

SQLServer的 LATCH和PAGELATCH:Latch是SQL server内部用来同步资源访问的一个数据结构,和操作系统的critical section类似,Latch保护了那些想保护的资源,使得访问同步有序;PAGEPATCH是同步访问数据库PAGE的Latch,SQL server的BP里每个数据库页(8K)都有一个对应的LATCH,要访问某个PAGE必须首先获得这个PAGE的LATCH,PAGELATCH有很多种,如共享的PAGELATCH_SH,独占的PAGELATCH_EX,更新的PAGELATCH_UP。

waitresource 2:*:2 分别表示database_id,file_id,page_id 对应资源是tempdb的某个datafile第二个数据页;了解SQLServer的存储结构可以知道datafile的前几个page是固定的系统page,第二个PAGE既是固定的全局分配映射页(GAM),TEMPDB做统一区分配的时候会用到;

分析到这里我们可以确认是TEMPDB的压力过大、页分配不够及时导致的卡慢问题;

对于不了解TEMPDB的同学可以参考下MSDN,和后续推论有关的是排序的中间结果集、行版本会用到TEMPDB;

关于TEMPDB的压力有如下几个可能性

可能性一

排查这个实例发现存在几个定时JOB,其中1个JOB是做索引重建功能的每两个小时执行一次,大致内容如下:


ALTER INDEX [***] ON [***].[***] REBUILD PARTITION = ALL WITH ( PAD_INDEX  = OFF, STATISTICS_NORECOMPUTE  = OFF, ALLOW_ROW_LOCKS  = ON, ALLOW_PAGE_LOCKS  = ON,  ONLINE = ON, SORT_IN_TEMPDB = OFF )

ONLINE=ON表示开启在线索引操作的特性,在线索引看起来很美但操作过程中创建的相关表的行版本对TEMPDB产生压力,特别情况还会产生死锁和空间耗尽的问题,具体过程可参考 在线索引

可能性二

阻塞链中抓取了被阻塞的SQL都类似,具体SQL内容涉及金融业务不再贴出,分析执行计划存在中间结果集排序操作会用到TEMPDB,和用户沟通确认这条SQL的执行频度当时有过调整,比正常情况高了2倍(正常5K、当时1.5W),因此怀疑是此SQL高并发导致;

可能性三

抓取现场的时候已经太晚了看到的都是结果,真正造成TEMPDB压力的SQL已经执行完成了;
SQLServer-TEMPDB性能问题排查

结论

可能性一和用户确认发生的时间和索引重建时间对不上->排除

可能性二后续做了进一步压测确认不是原因->排除

可能性三是最终确认的,SQL审计很重要

改善建议

首先检查TEMPDB当前的数据文件配置和CPU个数


SELECT 
    name AS FileName, 
    size*1.0/128 AS FileSizeinMB,
    CASE max_size 
        WHEN 0 THEN 'Autogrowth is off.'
        WHEN -1 THEN 'Autogrowth is on.'
        ELSE 'Log file will grow to a maximum size of 2 TB.'
    END AS Autogrowth,
    growth AS 'GrowthValue',
    'GrowthIncrement' = 
        CASE
            WHEN growth = 0 THEN 'Size is fixed and will not grow.'
            WHEN growth > 0 AND is_percent_growth = 0 
                THEN 'Growth value is in 8-KB pages.'
            ELSE 'Growth value is a percentage.'
        END
FROM tempdb.sys.database_files
WHERE type=0
GO    

select COUNT(*) as cpu_num from sys.dm_os_schedulers where is_online=1

SQLServer-TEMPDB性能问题排查

可以确认这是线上的正常配置,是比较通用的,但针对这个实例遇到的情况可以做进一步优化,增加TEMPDB datafile的数量保持和CPU个数一致(空间使用上需要再和用户同步),并且增加的datafile一定要和之前的的保持一致因为SQLServer对TEMPDB使用成比例填充算法来平衡所有datafile的可用空间,以此缓解datafile部分系统页的争抢;

另外配合成比例填充算法还有一个黑科技可再优化(1117),但对磁盘空间开销更大,这个最后在考虑

上一篇:MySql乱码解决(五)


下一篇:关于MS ODBC Driver的问题