vacuum freeze无法回收事务号问题分析

在template1数据库中进行vacuum freeze出现如下报错:

template1=> vacuum freeze template1.pg_catalog.pg_authid;
ERROR:  found xmin 1988747257 from before relfrozenxid 2810153180

问题原因

出现这种报错一般第一反应都是因为长事务引起的事务号无法回收,通过查询pg_stat_activity没有发现有长事务,而且是单表无法进行回收。原因是pg会把一些关键的系统表的元数据信息存储在relcache中,rd_isnailed=true就是表明这个表的relcache的relcache是不会去进行更新的,关键的系统表一般也不会去进行表元数据的变更,所以系统表不去更新relcache也是正常的

typedef struct RelationData
{
    RelFileNode rd_node;        /* relation physical identifier */
    /* use "struct" here to avoid needing to include smgr.h: */
    struct SMgrRelationData *rd_smgr;    /* cached file handle, or NULL */
    int            rd_refcnt;        /* reference count */
    BackendId    rd_backend;        /* owning backend id, if temporary relation */
    bool        rd_islocaltemp; /* rel is a temp rel of this session */
    bool        rd_isnailed;    /* rel is nailed in cache */
    bool        rd_isvalid;        /* relcache entry is valid */
    char        rd_indexvalid;    /* state of rd_indexlist: 0 = not valid, 1 =
                                 * valid, 2 = temporarily forced */
    bool        rd_statvalid;    /* is rd_statlist valid? */

    /*
     * rd_createSubid is the ID of the highest subtransaction the rel has
     * survived into; or zero if the rel was not created in the current top
     * transaction.  This can be now be relied on, whereas previously it could
     * be "forgotten" in earlier releases. Likewise, rd_newRelfilenodeSubid is
     * the ID of the highest subtransaction the relfilenode change has
     * survived into, or zero if not changed in the current transaction (or we
     * have forgotten changing it). rd_newRelfilenodeSubid can be forgotten
     * when a relation has multiple new relfilenodes within a single
     * transaction, with one of them occurring in a subsequently aborted
     * subtransaction, e.g. BEGIN; TRUNCATE t; SAVEPOINT save; TRUNCATE t;
     * ROLLBACK TO save; -- rd_newRelfilenode is now forgotten
     */
    SubTransactionId rd_createSubid;    /* rel was created in current xact */
    SubTransactionId rd_newRelfilenodeSubid;    /* new relfilenode assigned in
                                                 * current xact */

    Form_pg_class rd_rel;        /* RELATION tuple */
    TupleDesc    rd_att;            /* tuple descriptor */
    Oid            rd_id;            /* relation's object id */
    LockInfoData rd_lockInfo;    /* lock mgr's info for locking relation */
    RuleLock   *rd_rules;        /* rewrite rules */
    MemoryContext rd_rulescxt;    /* private memory cxt for rd_rules, if any */
    TriggerDesc *trigdesc;        /* Trigger info, or NULL if rel has none */
    /* use "struct" here to avoid needing to include rowsecurity.h: */
    struct RowSecurityDesc *rd_rsdesc;    /* row security policies, or NULL */

    /* data managed by RelationGetFKeyList: */
    List       *rd_fkeylist;    /* list of ForeignKeyCacheInfo (see below) */
    bool        rd_fkeyvalid;    /* true if list has been computed */

    MemoryContext rd_partkeycxt;    /* private memory cxt for the below */
    struct PartitionKeyData *rd_partkey;    /* partition key, or NULL */
    MemoryContext rd_pdcxt;        /* private context for partdesc */
    struct PartitionDescData *rd_partdesc;    /* partitions, or NULL */
    List       *rd_partcheck;    /* partition CHECK quals */

    /* data managed by RelationGetIndexList: */
    List       *rd_indexlist;    /* list of OIDs of indexes on relation */
    Oid            rd_pkindex;        /* OID of primary key, if any */
    Oid            rd_replidindex; /* OID of replica identity index, if any */

    /* data managed by RelationGetStatExtList: */
    List       *rd_statlist;    /* list of OIDs of extended stats */

    /* data managed by RelationGetIndexAttrBitmap: */
    Bitmapset  *rd_indexattr;    /* identifies columns used in indexes */
    Bitmapset  *rd_keyattr;        /* cols that can be ref'd by foreign keys */
    Bitmapset  *rd_pkattr;        /* cols included in primary key */
    Bitmapset  *rd_idattr;        /* included in replica identity index */

    PublicationActions *rd_pubactions;    /* publication actions */

    /*
     * rd_options is set whenever rd_rel is loaded into the relcache entry.
     * Note that you can NOT look into rd_rel for this data.  NULL means "use
     * defaults".
     */
    bytea       *rd_options;        /* parsed pg_class.reloptions */

    /* These are non-NULL only for an index relation: */
    Form_pg_index rd_index;        /* pg_index tuple describing this index */
    /* use "struct" here to avoid needing to include htup.h: */
    struct HeapTupleData *rd_indextuple;    /* all of pg_index tuple */

    /*
     * index access support info (used only for an index relation)
     *
     * Note: only default support procs for each opclass are cached, namely
     * those with lefttype and righttype equal to the opclass's opcintype. The
     * arrays are indexed by support function number, which is a sufficient
     * identifier given that restriction.
     *
     * Note: rd_amcache is available for index AMs to cache private data about
     * an index.  This must be just a cache since it may get reset at any time
     * (in particular, it will get reset by a relcache inval message for the
     * index).  If used, it must point to a single memory chunk palloc'd in
     * rd_indexcxt.  A relcache reset will include freeing that chunk and
     * setting rd_amcache = NULL.
     */
    Oid            rd_amhandler;    /* OID of index AM's handler function */
    MemoryContext rd_indexcxt;    /* private memory cxt for this stuff */
    /* use "struct" here to avoid needing to include amapi.h: */
    struct IndexAmRoutine *rd_indam;    /* index AM's API struct */
    Oid           *rd_opfamily;    /* OIDs of op families for each index col */
    Oid           *rd_opcintype;    /* OIDs of opclass declared input data types */
    RegProcedure *rd_support;    /* OIDs of support procedures */
    FmgrInfo   *rd_supportinfo; /* lookup info for support procedures */
    int16       *rd_indoption;    /* per-column AM-specific flags */
    List       *rd_indexprs;    /* index expression trees, if any */
    List       *rd_indpred;        /* index predicate tree, if any */
    Oid           *rd_exclops;        /* OIDs of exclusion operators, if any */
    Oid           *rd_exclprocs;    /* OIDs of exclusion ops' procs, if any */
    uint16       *rd_exclstrats;    /* exclusion ops' strategy numbers, if any */
    void       *rd_amcache;        /* available for use by index AM */
    Oid           *rd_indcollation;    /* OIDs of index collations */

    /*
     * foreign-table support
     *
     * rd_fdwroutine must point to a single memory chunk palloc'd in
     * CacheMemoryContext.  It will be freed and reset to NULL on a relcache
     * reset.
     */

    /* use "struct" here to avoid needing to include fdwapi.h: */
    struct FdwRoutine *rd_fdwroutine;    /* cached function pointers, or NULL */

    /*
     * Hack for CLUSTER, rewriting ALTER TABLE, etc: when writing a new
     * version of a table, we need to make any toast pointers inserted into it
     * have the existing toast table's OID, not the OID of the transient toast
     * table.  If rd_toastoid isn't InvalidOid, it is the OID to place in
     * toast pointers inserted into this rel.  (Note it's set on the new
     * version of the main heap, not the toast table itself.)  This also
     * causes toast_save_datum() to try to preserve toast value OIDs.
     */
    Oid            rd_toastoid;    /* Real TOAST table's OID, or InvalidOid */

    /* use "struct" here to avoid needing to include pgstat.h: */
    struct PgStat_TableStatus *pgstat_info; /* statistics collection area */
} RelationData;

relfrozenxid存储在rd_rel中

CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,RelationRelation_Rowtype_Id) BKI_SCHEMA_MACRO
{
    Oid            oid;            /* oid */
    NameData    relname;        /* class name */
    Oid            relnamespace;    /* OID of namespace containing this class */
    Oid            reltype;        /* OID of entry in pg_type for table's
                                 * implicit row type */
    Oid            reloftype;        /* OID of entry in pg_type for underlying
                                 * composite type */
    Oid            relowner;        /* class owner */
    Oid            relam;            /* index access method; 0 if not an index */
    Oid            relfilenode;    /* identifier of physical storage file */

    /* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */
    Oid            reltablespace;    /* identifier of table space for relation */
    int32        relpages;        /* # of blocks (not always up-to-date) */
    float4        reltuples;        /* # of tuples (not always up-to-date) */
    int32        relallvisible;    /* # of all-visible blocks (not always
                                 * up-to-date) */
    Oid            reltoastrelid;    /* OID of toast table; 0 if none */
    bool        relhasindex;    /* T if has (or has had) any indexes */
    bool        relisshared;    /* T if shared across databases */
    char        relpersistence; /* see RELPERSISTENCE_xxx constants below */
    char        relkind;        /* see RELKIND_xxx constants below */
    int16        relnatts;        /* number of user attributes */

    /*
     * Class pg_attribute must contain exactly "relnatts" user attributes
     * (with attnums ranging from 1 to relnatts) for this class.  It may also
     * contain entries with negative attnums for system attributes.
     */
    int16        relchecks;        /* # of CHECK constraints for class */
    bool        relhasrules;    /* has (or has had) any rules */
    bool        relhastriggers; /* has (or has had) any TRIGGERs */
    bool        relhassubclass; /* has (or has had) child tables or indexes */
    bool        relrowsecurity; /* row security is enabled or not */
    bool        relforcerowsecurity;    /* row security forced for owners or
                                         * not */
    bool        relispopulated; /* matview currently holds query results */
    char        relreplident;    /* see REPLICA_IDENTITY_xxx constants  */
    bool        relispartition; /* is relation a partition? */
    Oid            relrewrite;        /* heap for rewrite during DDL, link to
                                 * original rel */
    TransactionId relfrozenxid; /* all Xids < this are frozen in this rel */
    TransactionId relminmxid;    /* all multixacts in this rel are >= this.
                                 * this is really a MultiXactId */

#ifdef CATALOG_VARLEN            /* variable-length fields start here */
    /* NOTE: These fields are not present in a relcache entry's rd_rel field. */
    aclitem        relacl[1];        /* access permissions */
    text        reloptions[1];    /* access-method-specific options */
    pg_node_tree relpartbound;    /* partition bound node tree */
#endif
} FormData_pg_class;

vacuum过程中这个函数会对这行的xid和relfrozenxid进行比较,如果xid早于relfrozenxid就会出现这个报错,

heap_prepare_freeze_tuple(HeapTupleHeader tuple,
                          TransactionId relfrozenxid, TransactionId relminmxid,
                          TransactionId cutoff_xid, TransactionId cutoff_multi,
                          xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
{
    bool        changed = false;
    bool        xmax_already_frozen = false;
    bool        xmin_frozen;
    bool        freeze_xmax;
    TransactionId xid;

    frz->frzflags = 0;
    frz->t_infomask2 = tuple->t_infomask2;
    frz->t_infomask = tuple->t_infomask;
    frz->xmax = HeapTupleHeaderGetRawXmax(tuple);

    /* Process xmin */
    xid = HeapTupleHeaderGetXmin(tuple);             #取当前行的xmin,会先判断是否是frozenxid,如果是就会等于FrozenTransactionId,否则就是当前行的xmin。
    xmin_frozen = ((xid == FrozenTransactionId) ||
                   HeapTupleHeaderXminFrozen(tuple));
    if (TransactionIdIsNormal(xid))
    {
        if (TransactionIdPrecedes(xid, relfrozenxid))          #如果xid早于relfrozenxid就会出现上述的报错
            ereport(ERROR,
                    (errcode(ERRCODE_DATA_CORRUPTED),
                     errmsg_internal("found xmin %u from before relfrozenxid %u",
                                     xid, relfrozenxid)));

        if (TransactionIdPrecedes(xid, cutoff_xid))
        {
            if (!TransactionIdDidCommit(xid))          
                ereport(ERROR,
                        (errcode(ERRCODE_DATA_CORRUPTED),
                         errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
                                         xid, cutoff_xid)));

            frz->t_infomask |= HEAP_XMIN_FROZEN;
            changed = true;
            xmin_frozen = true;
        }
    }
.....
}

xid是如何进行比较的呢,如果不超过21亿是正常比较,超过21亿的需要取模进行比较,如果行上的t_infomask中HEAP_XMIN_COMMITTED,HEAP_XMIN_INVALID同时存在就会被认为是FrozenTransactionId。

#define HEAP_XMIN_COMMITTED        0x0100    /* t_xmin committed */
#define HEAP_XMIN_INVALID        0x0200    /* t_xmin invalid/aborted */
#define HEAP_XMIN_FROZEN        (HEAP_XMIN_COMMITTED|HEAP_XMIN_INVALID)
#define InvalidTransactionId        ((TransactionId) 0)
#define BootstrapTransactionId        ((TransactionId) 1)
#define FrozenTransactionId            ((TransactionId) 2)
#define FirstNormalTransactionId    ((TransactionId) 3)
#define MaxTransactionId            ((TransactionId) 0xFFFFFFFF)

所以问题的根因就是vacuum是读到的relfrozexid来自relcache的,更新catalog中的relfrozexid不会去刷新relcache,导致一直读到的是错误的relfrozexid,所以出现vacuum freeze报错的问题。10.2,9.6.7,9.5.11,9.4.16到修复版本之间的版本的pg实例都会存在相关问题。

10.5, 9.6.10, 9.5.14, 9.4.19对这个问题进行了修复。https://git.postgresql.org/gitweb/?p=postgresql.git;a=commitdiff;h=817f9f9a8a1932a0cd8c6bc5c9d3e77f6a80e659

问题解法

目前可以通过两种方式进行修复

1.重启数据库,重启后会重新读入新数据内容到relcache中,相当于刷新relcache。

2.删除$PGDATA/global/pg_internal.init,这个文件就是存储的relcache的内容,有新的连接连入会创建新的pg_internal.init文件。

上一篇:对未来的想法


下一篇:设计模式学习(二):软件设计与模式