分享U9夜里跑MRP有时中断排查方法

2025/04/26

问题

  • U9夜里1点自动跑MRP有时候出现中断

image-20250427195459139

分析

  • 查询当时的日志发现有出现错误“无法完成游标操作,因为在声明该游标后,表架构发生了更改”,发现这个时候正好MSSQL有一个任务整理索引碎片冲突导致。

重建索引在 SQL Server 内部本质上涉及表的结构重建和元数据更新,特别是聚集索引,会导致表的“架构版本”改变,这就是为什么SQL Server认为“表架构发生了变化”。只要有依赖原表结构的对象(比如游标),就会失效并报错。

解决方法

  • 调整索引碎片整理时间点+优化碎片整理语句
-- 检查是否有活动游标或长时间运行的查询
IF EXISTS (
    SELECT 1 
    FROM sys.dm_exec_sessions s
    JOIN sys.dm_exec_requests r ON s.session_id = r.session_id
    WHERE r.database_id = DB_ID()
      AND (r.command LIKE '%CURSOR%' OR r.total_elapsed_time > 300000) -- 检查游标操作或运行超过5分钟的查询
      AND s.session_id <> @@SPID
)
BEGIN
    PRINT '存在活动游标或长时间运行的查询,跳过索引维护以避免冲突';
    RETURN;
END

DECLARE @TableName NVARCHAR(128);
DECLARE @IndexName NVARCHAR(128);
DECLARE @SchemaName NVARCHAR(128);
DECLARE @Fragmentation FLOAT;
DECLARE @SQL NVARCHAR(MAX);
DECLARE @AllowOnline BIT;
DECLARE @TableID INT;
DECLARE @RowCount BIGINT;
DECLARE @MaxTableSize BIGINT = 10000000; -- 设置表大小阈值,超过此值的大表将单独处理

-- 创建临时表存储需要处理的索引信息
IF OBJECT_ID('tempdb..#IndexesToProcess') IS NOT NULL
    DROP TABLE #IndexesToProcess;

CREATE TABLE #IndexesToProcess (
    ID INT IDENTITY(1,1) PRIMARY KEY,
    SchemaName NVARCHAR(128),
    TableName NVARCHAR(128),
    IndexName NVARCHAR(128),
    Fragmentation FLOAT,
    TableSize BIGINT,
    AllowOnline BIT DEFAULT 0,
    Processed BIT DEFAULT 0
);

-- 收集需要处理的索引信息
INSERT INTO #IndexesToProcess (SchemaName, TableName, IndexName, Fragmentation, TableSize)
SELECT 
    sch.name AS SchemaName,
    tbl.name AS TableName,
    idx.name AS IndexName,
    ps.avg_fragmentation_in_percent,
    SUM(p.rows) AS TableSize
FROM 
    sys.dm_db_index_physical_stats(DB_ID(), NULL, NULL, NULL, 'LIMITED') AS ps
    INNER JOIN sys.indexes AS idx
        ON ps.object_id = idx.object_id AND ps.index_id = idx.index_id
    INNER JOIN sys.tables AS tbl
        ON ps.object_id = tbl.object_id
    INNER JOIN sys.schemas AS sch
        ON tbl.schema_id = sch.schema_id
    INNER JOIN sys.partitions AS p
        ON idx.object_id = p.object_id AND idx.index_id = p.index_id
WHERE 
    ps.avg_fragmentation_in_percent > 10  -- 碎片率大于10%
    AND idx.type_desc IN ('CLUSTERED', 'NONCLUSTERED')  -- 只处理聚集和非聚集索引
GROUP BY
    sch.name, tbl.name, idx.name, ps.avg_fragmentation_in_percent
ORDER BY 
    ps.avg_fragmentation_in_percent DESC;

-- 更新是否允许联机操作的标志
UPDATE i
SET AllowOnline = CASE 
    WHEN subquery.count = 0 THEN 1
    ELSE 0
END
FROM #IndexesToProcess i
CROSS APPLY (
    SELECT COUNT_BIG(*) AS count
    FROM sys.index_columns ic
    INNER JOIN sys.columns c
        ON ic.object_id = c.object_id AND ic.column_id = c.column_id
    INNER JOIN sys.types t
        ON c.user_type_id = t.user_type_id
    WHERE ic.object_id = OBJECT_ID(QUOTENAME(i.SchemaName) + '.' + QUOTENAME(i.TableName))
        AND ic.index_id = (
            SELECT index_id 
            FROM sys.indexes 
            WHERE object_id = ic.object_id AND name = i.IndexName
        )
        AND t.name IN ('text', 'ntext', 'image', 'filestream')
) AS subquery;

-- 先处理小表
DECLARE IndexCursor CURSOR FOR
SELECT ID, SchemaName, TableName, IndexName, Fragmentation, AllowOnline
FROM #IndexesToProcess
WHERE TableSize <= @MaxTableSize
ORDER BY Fragmentation DESC;

OPEN IndexCursor;
FETCH NEXT FROM IndexCursor INTO @TableID, @SchemaName, @TableName, @IndexName, @Fragmentation, @AllowOnline;

WHILE @@FETCH_STATUS = 0
BEGIN
    -- 再次检查是否有活动游标
    IF EXISTS (
        SELECT 1 
        FROM sys.dm_exec_sessions s
        JOIN sys.dm_exec_requests r ON s.session_id = r.session_id
        WHERE r.database_id = DB_ID()
          AND (r.command LIKE '%CURSOR%' OR r.total_elapsed_time > 300000)
          AND s.session_id <> @@SPID
    )
    BEGIN
        PRINT '检测到活动游标或长时间运行的查询,跳过当前索引: ' + @SchemaName + '.' + @TableName + '.' + @IndexName;
        GOTO NextIndex;
    END

    -- 检查表是否正在被使用
    IF EXISTS (
        SELECT 1
        FROM sys.dm_tran_locks l
        JOIN sys.partitions p ON l.resource_associated_entity_id = p.hobt_id
        JOIN sys.objects o ON p.object_id = o.object_id
        JOIN sys.schemas s ON o.schema_id = s.schema_id
        WHERE s.name = @SchemaName AND o.name = @TableName
          AND l.request_session_id <> @@SPID
    )
    BEGIN
        PRINT '表正在被其他会话使用,跳过当前索引: ' + @SchemaName + '.' + @TableName + '.' + @IndexName;
        GOTO NextIndex;
    END

    IF @Fragmentation BETWEEN 10 AND 30
    BEGIN
        -- 碎片率在10%到30%之间,选择重组索引
        SET @SQL = N'ALTER INDEX [' + @IndexName + '] ON [' + @SchemaName + '].[' + @TableName + '] REORGANIZE;';
        PRINT '重组索引: ' + @SchemaName + '.' + @TableName + '.' + @IndexName + ' 碎片率: ' + CAST(@Fragmentation AS NVARCHAR(10)) + '%';
    END
    ELSE IF @Fragmentation > 30
    BEGIN
        -- 碎片率大于30%,选择重建索引
        IF @AllowOnline = 1
        BEGIN
            SET @SQL = N'ALTER INDEX [' + @IndexName + '] ON [' + @SchemaName + '].[' + @TableName + '] REBUILD WITH (FILLFACTOR = 80, SORT_IN_TEMPDB = ON, ONLINE = ON);';
            PRINT '重建索引(联机): ' + @SchemaName + '.' + @TableName + '.' + @IndexName + ' 碎片率: ' + CAST(@Fragmentation AS NVARCHAR(10)) + '%';
        END
        ELSE
        BEGIN
            SET @SQL = N'ALTER INDEX [' + @IndexName + '] ON [' + @SchemaName + '].[' + @TableName + '] REBUILD WITH (FILLFACTOR = 80, SORT_IN_TEMPDB = ON);';
            PRINT '重建索引(脱机): ' + @SchemaName + '.' + @TableName + '.' + @IndexName + ' 碎片率: ' + CAST(@Fragmentation AS NVARCHAR(10)) + '%';
        END
    END
    ELSE
    BEGIN
        -- 碎片率低于10%,不处理
        SET @SQL = NULL;
    END

    IF @SQL IS NOT NULL
    BEGIN
        BEGIN TRY
            EXEC sp_executesql @SQL;
            UPDATE #IndexesToProcess SET Processed = 1 WHERE ID = @TableID;
        END TRY
        BEGIN CATCH
            PRINT '处理索引时出错: ' + @SchemaName + '.' + @TableName + '.' + @IndexName + ' 错误信息: ' + ERROR_MESSAGE();
        END CATCH
    END

    NextIndex:
    FETCH NEXT FROM IndexCursor INTO @TableID, @SchemaName, @TableName, @IndexName, @Fragmentation, @AllowOnline;
END

CLOSE IndexCursor;
DEALLOCATE IndexCursor;

-- 最后处理大表(如果有时间和资源)
DECLARE LargeTableCursor CURSOR FOR
SELECT ID, SchemaName, TableName, IndexName, Fragmentation, AllowOnline
FROM #IndexesToProcess
WHERE TableSize > @MaxTableSize AND Processed = 0
ORDER BY Fragmentation DESC;

OPEN LargeTableCursor;
FETCH NEXT FROM LargeTableCursor INTO @TableID, @SchemaName, @TableName, @IndexName, @Fragmentation, @AllowOnline;

WHILE @@FETCH_STATUS = 0
BEGIN
    -- 再次检查是否有活动游标
    IF EXISTS (
        SELECT 1 
        FROM sys.dm_exec_sessions s
        JOIN sys.dm_exec_requests r ON s.session_id = r.session_id
        WHERE r.database_id = DB_ID()
          AND (r.command LIKE '%CURSOR%' OR r.total_elapsed_time > 300000)
          AND s.session_id <> @@SPID
    )
    BEGIN
        PRINT '检测到活动游标或长时间运行的查询,跳过大表索引: ' + @SchemaName + '.' + @TableName + '.' + @IndexName;
        GOTO NextLargeIndex;
    END

    -- 检查表是否正在被使用
    IF EXISTS (
        SELECT 1
        FROM sys.dm_tran_locks l
        JOIN sys.partitions p ON l.resource_associated_entity_id = p.hobt_id
        JOIN sys.objects o ON p.object_id = o.object_id
        JOIN sys.schemas s ON o.schema_id = s.schema_id
        WHERE s.name = @SchemaName AND o.name = @TableName
          AND l.request_session_id <> @@SPID
    )
    BEGIN
        PRINT '大表正在被其他会话使用,跳过当前索引: ' + @SchemaName + '.' + @TableName + '.' + @IndexName;
        GOTO NextLargeIndex;
    END

    PRINT '开始处理大表索引: ' + @SchemaName + '.' + @TableName + '.' + @IndexName + ' 碎片率: ' + CAST(@Fragmentation AS NVARCHAR(10)) + '%';

    IF @Fragmentation BETWEEN 10 AND 30
    BEGIN
        -- 碎片率在10%到30%之间,选择重组索引
        SET @SQL = N'ALTER INDEX [' + @IndexName + '] ON [' + @SchemaName + '].[' + @TableName + '] REORGANIZE;';
        PRINT '重组大表索引: ' + @SchemaName + '.' + @TableName + '.' + @IndexName;
    END
    ELSE IF @Fragmentation > 30
    BEGIN
        -- 碎片率大于30%,选择重建索引
        IF @AllowOnline = 1
        BEGIN
            SET @SQL = N'ALTER INDEX [' + @IndexName + '] ON [' + @SchemaName + '].[' + @TableName + '] REBUILD WITH (FILLFACTOR = 80, SORT_IN_TEMPDB = ON, ONLINE = ON, MAXDOP = 2);';
            PRINT '重建大表索引(联机): ' + @SchemaName + '.' + @TableName + '.' + @IndexName;
        END
        ELSE
        BEGIN
            -- 对于大表,如果不支持联机重建,可以考虑只进行重组而不是重建
            SET @SQL = N'ALTER INDEX [' + @IndexName + '] ON [' + @SchemaName + '].[' + @TableName + '] REORGANIZE;';
            PRINT '大表不支持联机重建,改为重组索引: ' + @SchemaName + '.' + @TableName + '.' + @IndexName;
        END
    END
    ELSE
    BEGIN
        -- 碎片率低于10%,不处理
        SET @SQL = NULL;
    END

    IF @SQL IS NOT NULL
    BEGIN
        BEGIN TRY
            EXEC sp_executesql @SQL;
            UPDATE #IndexesToProcess SET Processed = 1 WHERE ID = @TableID;
        END TRY
        BEGIN CATCH
            PRINT '处理大表索引时出错: ' + @SchemaName + '.' + @TableName + '.' + @IndexName + ' 错误信息: ' + ERROR_MESSAGE();
        END CATCH
    END

    NextLargeIndex:
    FETCH NEXT FROM LargeTableCursor INTO @TableID, @SchemaName, @TableName, @IndexName, @Fragmentation, @AllowOnline;
END

CLOSE LargeTableCursor;
DEALLOCATE LargeTableCursor;

-- 清理临时表
DROP TABLE #IndexesToProcess;

PRINT '索引碎片整理完成';