查看oplog大小
handong1:PRIMARY> db.getReplicationInfo()
{
"logSizeMB" : 1000,
"usedMB" : 1.41,
"timeDiff" : 62833,
"timeDiffHours" : 17.45,
"tFirst" : "Thu Apr 29 2021 17:22:31 GMT+0800 (CST)",
"tLast" : "Fri Apr 30 2021 10:49:44 GMT+0800 (CST)",
"now" : "Fri Apr 30 2021 10:49:54 GMT+0800 (CST)"
}
可以看到oplog几乎没有数据。
查看副本集配置信息
handong1:PRIMARY> rs.conf()
{
"_id" : "handong1",
"version" : 44,
"protocolVersion" : NumberLong(1),
"writeConcernMajorityJournalDefault" : true,
"members" : [
{
"_id" : 3,
"host" : "mongo4:27017",
"arbiterOnly" : false,
"buildIndexes" : true,
"hidden" : false,
"priority" : 1,
"tags" : {
},
"slaveDelay" : NumberLong(0),
"votes" : 1
},
{
"_id" : 4,
"host" : "mongo3:27017",
"arbiterOnly" : false,
"buildIndexes" : true,
"hidden" : false,
"priority" : 1,
"tags" : {
},
"slaveDelay" : NumberLong(0),
"votes" : 1
},
{
"_id" : 5,
"host" : "mongo2:27017",
"arbiterOnly" : false,
"buildIndexes" : true,
"hidden" : false,
"priority" : 1,
"tags" : {
},
"slaveDelay" : NumberLong(0),
"votes" : 1
},
{
"_id" : 6,
"host" : "mongo1:27017",
"arbiterOnly" : false,
"buildIndexes" : true,
"hidden" : true,
"priority" : 0,
"tags" : {
},
"slaveDelay" : NumberLong(7200),
"votes" : 1
}
],
"settings" : {
"chainingAllowed" : true,
"heartbeatIntervalMillis" : 2000,
"heartbeatTimeoutSecs" : 10,
"electionTimeoutMillis" : 10000,
"catchUpTimeoutMillis" : -1,
"catchUpTakeoverDelayMillis" : 30000,
"getLastErrorModes" : {
},
"getLastErrorDefaults" : {
"w" : 1,
"wtimeout" : 0
},
"replicaSetId" : ObjectId("6051ca21a1f8138a01c3b940")
}
}
可以看到mongo1开启了延迟复制,我设置的7200s。
主节点写入数据
handong1:PRIMARY> db.getReplicationInfo()
{
"logSizeMB" : 1000,
"usedMB" : 1000.77,
"timeDiff" : 10817,
"timeDiffHours" : 3,
"tFirst" : "Fri Apr 30 2021 11:05:21 GMT+0800 (CST)",
"tLast" : "Fri Apr 30 2021 14:05:38 GMT+0800 (CST)",
"now" : "Fri Apr 30 2021 14:05:45 GMT+0800 (CST)"
}
源端大量写入数据,可以看到oplog已经写满。接下来我们等延迟节点故障发生。
查看副本集状态
handong1:PRIMARY> rs.status()
{
"set" : "handong1",
"date" : ISODate("2021-04-30T06:12:07.964Z"),
"myState" : 1,
"term" : NumberLong(26),
"syncingTo" : "",
"syncSourceHost" : "",
"syncSourceId" : -1,
"heartbeatIntervalMillis" : NumberLong(2000),
"majorityVoteCount" : 3,
"writeMajorityCount" : 3,
"optimes" : {
"lastCommittedOpTime" : {
"ts" : Timestamp(1619763118, 1),
"t" : NumberLong(26)
},
"lastCommittedWallTime" : ISODate("2021-04-30T06:11:58.901Z"),
"readConcernMajorityOpTime" : {
"ts" : Timestamp(1619763118, 1),
"t" : NumberLong(26)
},
"readConcernMajorityWallTime" : ISODate("2021-04-30T06:11:58.901Z"),
"appliedOpTime" : {
"ts" : Timestamp(1619763118, 1),
"t" : NumberLong(26)
},
"durableOpTime" : {
"ts" : Timestamp(1619763118, 1),
"t" : NumberLong(26)
},
"lastAppliedWallTime" : ISODate("2021-04-30T06:11:58.901Z"),
"lastDurableWallTime" : ISODate("2021-04-30T06:11:58.901Z")
},
"lastStableRecoveryTimestamp" : Timestamp(1619763090, 1),
"lastStableCheckpointTimestamp" : Timestamp(1619763090, 1),
"electionCandidateMetrics" : {
"lastElectionReason" : "stepUpRequestSkipDryRun",
"lastElectionDate" : ISODate("2021-04-29T10:44:16.113Z"),
"electionTerm" : NumberLong(26),
"lastCommittedOpTimeAtElection" : {
"ts" : Timestamp(1619693040, 1),
"t" : NumberLong(25)
},
"lastSeenOpTimeAtElection" : {
"ts" : Timestamp(1619693040, 1),
"t" : NumberLong(25)
},
"numVotesNeeded" : 3,
"priorityAtElection" : 1,
"electionTimeoutMillis" : NumberLong(10000),
"priorPrimaryMemberId" : 3,
"numCatchUpOps" : NumberLong(0),
"newTermStartDate" : ISODate("2021-04-29T10:44:16.131Z"),
"wMajorityWriteAvailabilityDate" : ISODate("2021-04-29T10:44:18.139Z")
},
"electionParticipantMetrics" : {
"votedForCandidate" : true,
"electionTerm" : NumberLong(25),
"lastVoteDate" : ISODate("2021-04-29T09:51:57.461Z"),
"electionCandidateMemberId" : 3,
"voteReason" : "",
"lastAppliedOpTimeAtElection" : {
"ts" : Timestamp(1619689910, 1),
"t" : NumberLong(24)
},
"maxAppliedOpTimeInSet" : {
"ts" : Timestamp(1619689910, 1),
"t" : NumberLong(24)
},
"priorityAtElection" : 1
},
"members" : [
{
"_id" : 3,
"name" : "mongo4:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 68644,
"optime" : {
"ts" : Timestamp(1619763118, 1),
"t" : NumberLong(26)
},
"optimeDurable" : {
"ts" : Timestamp(1619763118, 1),
"t" : NumberLong(26)
},
"optimeDate" : ISODate("2021-04-30T06:11:58Z"),
"optimeDurableDate" : ISODate("2021-04-30T06:11:58Z"),
"lastHeartbeat" : ISODate("2021-04-30T06:12:06.441Z"),
"lastHeartbeatRecv" : ISODate("2021-04-30T06:12:07.057Z"),
"pingMs" : NumberLong(0),
"lastHeartbeatMessage" : "",
"syncingTo" : "mongo2:27017",
"syncSourceHost" : "mongo2:27017",
"syncSourceId" : 5,
"infoMessage" : "",
"configVersion" : 44
},
{
"_id" : 4,
"name" : "mongo3:27017",
"health" : 1,
"state" : 1,
"stateStr" : "PRIMARY",
"uptime" : 74984,
"optime" : {
"ts" : Timestamp(1619763118, 1),
"t" : NumberLong(26)
},
"optimeDate" : ISODate("2021-04-30T06:11:58Z"),
"syncingTo" : "",
"syncSourceHost" : "",
"syncSourceId" : -1,
"infoMessage" : "",
"electionTime" : Timestamp(1619693056, 1),
"electionDate" : ISODate("2021-04-29T10:44:16Z"),
"configVersion" : 44,
"self" : true,
"lastHeartbeatMessage" : ""
},
{
"_id" : 5,
"name" : "mongo2:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 74872,
"optime" : {
"ts" : Timestamp(1619763118, 1),
"t" : NumberLong(26)
},
"optimeDurable" : {
"ts" : Timestamp(1619763118, 1),
"t" : NumberLong(26)
},
"optimeDate" : ISODate("2021-04-30T06:11:58Z"),
"optimeDurableDate" : ISODate("2021-04-30T06:11:58Z"),
"lastHeartbeat" : ISODate("2021-04-30T06:12:07.665Z"),
"lastHeartbeatRecv" : ISODate("2021-04-30T06:12:06.064Z"),
"pingMs" : NumberLong(0),
"lastHeartbeatMessage" : "",
"syncingTo" : "mongo3:27017",
"syncSourceHost" : "mongo3:27017",
"syncSourceId" : 4,
"infoMessage" : "",
"configVersion" : 44
},
{
"_id" : 6,
"name" : "mongo1:27017",
"health" : 1,
"state" : 3,
"stateStr" : "RECOVERING",
"uptime" : 74426,
"optime" : {
"ts" : Timestamp(1619751554, 1000),
"t" : NumberLong(26)
},
"optimeDurable" : {
"ts" : Timestamp(1619751554, 1000),
"t" : NumberLong(26)
},
"optimeDate" : ISODate("2021-04-30T02:59:14Z"),
"optimeDurableDate" : ISODate("2021-04-30T02:59:14Z"),
"lastHeartbeat" : ISODate("2021-04-30T06:12:07.004Z"),
"lastHeartbeatRecv" : ISODate("2021-04-30T06:12:07.750Z"),
"pingMs" : NumberLong(0),
"lastHeartbeatMessage" : "",
"syncingTo" : "",
"syncSourceHost" : "",
"syncSourceId" : -1,
"infoMessage" : "",
"configVersion" : 44
}
],
"ok" : 1,
"$clusterTime" : {
"clusterTime" : Timestamp(1619763118, 1),
"signature" : {
"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
"keyId" : NumberLong(0)
}
},
"operationTime" : Timestamp(1619763118, 1)
}
mongo1的状态是RECOVERING。
查看日志
2021-04-30T12:58:37.323+0800 I REPL [replication-28] Restarting oplog query due to error: CursorNotFound: error in fetcher batch callback :: caused by :: cursor id 8075047242504228364 not found. Last fetched optime: { ts: Timestamp(1619751554, 1000), t: 26 }. Restarts remaining: 1
2021-04-30T12:58:37.324+0800 I REPL [replication-28] Scheduled new oplog query Fetcher source: mongo3:27017 database: local query: { find: "oplog.rs", filter: { ts: { $gte: Timestamp(1619751554, 1000) } }, tailable: true, oplogReplay: true, awaitData: true, maxTimeMS: 2000, batchSize: 13981010, term: 26, readConcern: { afterClusterTime: Timestamp(0, 1) } } query metadata: { $replData: 1, $oplogQueryData: 1, $readPreference: { mode: "secondaryPreferred" } } active: 1 findNetworkTimeout: 7000ms getMoreNetworkTimeout: 10000ms shutting down?: 0 first: 1 firstCommandScheduler: RemoteCommandRetryScheduler request: RemoteCommand 6893257 -- target:mongo3:27017 db:local cmd:{ find: "oplog.rs", filter: { ts: { $gte: Timestamp(1619751554, 1000) } }, tailable: true, oplogReplay: true, awaitData: true, maxTimeMS: 2000, batchSize: 13981010, term: 26, readConcern: { afterClusterTime: Timestamp(0, 1) } } active: 1 callbackHandle.valid: 1 callbackHandle.cancelled: 0 attempt: 1 retryPolicy: {type: "NoRetryPolicy"}
2021-04-30T12:58:38.111+0800 I REPL [rsBackgroundSync] Starting rollback due to OplogStartMissing: Our last optime fetched: { ts: Timestamp(1619751554, 1000), t: 26 }. source‘s GTE: { ts: Timestamp(1619751921, 809), t: 26 }
2021-04-30T12:58:38.112+0800 I REPL [rsBackgroundSync] Replication commit point: { ts: Timestamp(1619758717, 1), t: 26 }
2021-04-30T12:58:38.112+0800 I REPL [rsBackgroundSync] Waiting for all operations from { ts: Timestamp(1619751517, 1000), t: 26 } until { ts: Timestamp(1619751554, 1000), t: 26 } to be applied before starting rollback.
2021-04-30T12:59:15.545+0800 I REPL [rsBackgroundSync] Rollback using ‘recoverToStableTimestamp‘ method.
2021-04-30T12:59:15.546+0800 I REPL [rsBackgroundSync] Scheduling rollback (sync source: mongo3:27017)
2021-04-30T12:59:15.546+0800 I ROLLBACK [rsBackgroundSync] transition to ROLLBACK
2021-04-30T12:59:15.546+0800 I REPL [rsBackgroundSync] State transition ops metrics: { lastStateTransition: "rollback", userOpsKilled: 0, userOpsRunning: 7 }
2021-04-30T12:59:15.546+0800 I REPL [rsBackgroundSync] transition to ROLLBACK from SECONDARY
可以看到由于延迟节点的oplog被覆盖出现了复制错误。
问题总结
出现这样的问题,一般有两种原因:
1、副本节点oplog设置太小
2、设置了延迟复制
解决方法
1、调大副本集oplog
2、优化延迟复制时间
3、重新添加副本集节点