pthread_mutex 引起的 core dump + 学习 gdb

示例

#include <stdio.h>    
#include <unistd.h>    
#include "pthread.h"    

pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;      
 
void * process(void * arg)    /* Thread entry point; arg is the thread's name string, printed with %s. */
{    
    fprintf(stderr, "Starting process %s\n", (char *) arg);    
    /* NOTE: this thread only ever locks `lock`; the unlock calls are made by main(). */
    while (1) {  
        /* Lock in order to wait for some resource */  
        pthread_mutex_lock(&lock);  
        fprintf(stderr, "Process %s lock mutex\n", (char *) arg);    
        /* A successful lock means the resource is ready */  
        usleep(1000);  
        /* do something */  
    }  
    /* Not reached: the loop above has no exit. */
    return NULL;    
}    
  
int main(void)    /* Starts threads "a" and "b" running process(), then unlocks `lock` in an endless loop. */
{    
    pthread_t th_a, th_b;    
    int ret = 0;    
    /* A failed pthread_create is only reported on stderr; execution continues. */
    ret = pthread_create(&th_a, NULL, process, "a");    
    if (ret != 0) fprintf(stderr, "create a failed %d\n", ret);    
    
    ret = pthread_create(&th_b, NULL, process, "b");    
    if (ret != 0) fprintf(stderr, "create b failed %d\n", ret);    
    /* NOTE: main never locks `lock` itself; it only unlocks, with no wait between iterations. */
    while (1) {  
        /* Wait for and detect that some resource is ready */  
        /* something */  
        /* Unlock to tell the threads that the resource is ready */  
        pthread_mutex_unlock(&lock);   
        fprintf(stderr, "Main Process unlock mutex\n");    
    }  
    /* Not reached: the loop above has no exit. */
    return 0;    
}

//====================================================================================

开启 gdb

root@ubuntu:/corefile# gdb ./test2/main core-main-4747-1624774848 
Reading symbols from ./test2/main...done.
[New LWP 4749]
[New LWP 4748]
[New LWP 4747]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
Core was generated by `./main'.
Program terminated with signal SIGABRT, Aborted.
#0  0x00007f45eb3af438 in __GI_raise (sig=sig@entry=6)
    at ../sysdeps/unix/sysv/linux/raise.c:54
54	../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
[Current thread is 1 (Thread 0x7f45eab78700 (LWP 4749))]

上文指出
(1)三个线程:4747 、 4748 、 4749(对应 gdb 的 thread 1)
(2)core dump 爆在 4749

(gdb) bt
#0  0x00007f45eb3af438 in __GI_raise (sig=sig@entry=6)
    at ../sysdeps/unix/sysv/linux/raise.c:54
#1  0x00007f45eb3b103a in __GI_abort () at abort.c:89
#2  0x00007f45eb3a7be7 in __assert_fail_base (fmt=<optimized out>, 
    assertion=assertion@entry=0x7f45eb757015 "mutex->__data.__owner == 0", 
    file=file@entry=0x7f45eb756ff8 "../nptl/pthread_mutex_lock.c", line=line@entry=81, 
    function=function@entry=0x7f45eb757180 <__PRETTY_FUNCTION__.8623> "__pthread_mutex_lock") at assert.c:92
#3  0x00007f45eb3a7c92 in __GI___assert_fail (
    assertion=assertion@entry=0x7f45eb757015 "mutex->__data.__owner == 0", 
    file=file@entry=0x7f45eb756ff8 "../nptl/pthread_mutex_lock.c", line=line@entry=81, 
    function=function@entry=0x7f45eb757180 <__PRETTY_FUNCTION__.8623> "__pthread_mutex_lock") at assert.c:101
#4  0x00007f45eb74df68 in __GI___pthread_mutex_lock (mutex=<optimized out>)
    at ../nptl/pthread_mutex_lock.c:81
#5  0x00000000004007e9 in process (arg=0x4009a6) at main.c:13
#6  0x00007f45eb74b6ba in start_thread (arg=0x7f45eab78700) at pthread_create.c:333
#7  0x00007f45eb4814dd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

上文指出
(4)thread 1 的卡死位置在 main.c:13
(5)卡死原因: 断言失败,assertion@entry=0x7f45eb757015 "mutex->__data.__owner == 0"

(gdb) info line main.c:13
Line 13 of "main.c" starts at address 0x4007df <process+41>
   and ends at 0x4007e9 <process+51>.
(gdb) list *(0x4007df)
0x4007df is in process (main.c:13).
8	{    
9	    fprintf(stderr, "Starting process %s\n", (char *) arg);    
10	  
11	    while (1) {  
12	        /* 加锁等待某些资源 */  
13	        pthread_mutex_lock(&lock);  
14	        fprintf(stderr, "Process %s lock mutex\n", (char *) arg);    
15	        /* 加锁成功表示资源就绪 */  
16	        usleep(1000);  
17	        /* do something */  

或者用栈帧的方式看

(gdb) frame 5
#5  0x00000000004007e9 in process (arg=0x4009a6) at main.c:13
13	        pthread_mutex_lock(&lock);  

上文指出
(6)thread 1 的具体卡死位置是 process 函数中的 pthread_mutex_lock(&lock);

看下其他几个线程的位置:

(gdb) info threads
  Id   Target Id         Frame 
* 1    Thread 0x7f45eab78700 (LWP 4749) 0x00007f45eb3af438 in __GI_raise (sig=sig@entry=6)
    at ../sysdeps/unix/sysv/linux/raise.c:54
  2    Thread 0x7f45eb379700 (LWP 4748) 0x00007f45eb44638d in nanosleep ()
    at ../sysdeps/unix/syscall-template.S:84
  3    Thread 0x7f45ebb6a700 (LWP 4747) __lll_unlock_wake_private ()
    at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:341
(gdb) thread 2
[Switching to thread 2 (Thread 0x7f45eb379700 (LWP 4748))]
#0  0x00007f45eb44638d in nanosleep () at ../sysdeps/unix/syscall-template.S:84
84	../sysdeps/unix/syscall-template.S: No such file or directory.

(gdb) bt
#0  0x00007f45eb44638d in nanosleep () at ../sysdeps/unix/syscall-template.S:84
#1  0x00007f45eb477e54 in usleep (useconds=<optimized out>) at ../sysdeps/posix/usleep.c:32
#2  0x0000000000400810 in process (arg=0x400990) at main.c:16
#3  0x00007f45eb74b6ba in start_thread (arg=0x7f45eb379700) at pthread_create.c:333
#4  0x00007f45eb4814dd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

(gdb) thread
[Current thread is 2 (Thread 0x7f45eb379700 (LWP 4748))]

(gdb) info main.c:16
Undefined info command: "main.c:16".  Try "help info".
(gdb) info line main.c:16
Line 16 of "main.c" starts at address 0x400806 <process+80>
   and ends at 0x400810 <process+90>.

(gdb) list *(0x400810)
0x400810 is in process (main.c:18).
13	        pthread_mutex_lock(&lock);  
14	        fprintf(stderr, "Process %s lock mutex\n", (char *) arg);    
15	        /* 加锁成功表示资源就绪 */  
16	        usleep(1000);  
17	        /* do something */  
18	    }  
19	  
20	    return NULL;    
21	}    
22	  

上文指出:
(7)thread 2 是 4748 线程
(8)thread 2 卡在 main.c:16 的 process 函数中的 usleep(1000);

(gdb) thread 3
[Switching to thread 3 (Thread 0x7f45ebb6a700 (LWP 4747))]
#0  __lll_unlock_wake_private () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:341
341	../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S: No such file or directory.
(gdb) bt
#0  __lll_unlock_wake_private () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:341
#1  0x00007f45eb3e8840 in _IO_acquire_lock_fct (p=<synthetic pointer>) at libioP.h:867
#2  __GI__IO_fwrite (buf=0x4009bc, size=1, count=26, fp=0x7f45eb73f540 <_IO_2_1_stderr_>)
    at iofwrite.c:37
#3  0x00000000004008d8 in main () at main.c:39

(gdb) thread
[Current thread is 3 (Thread 0x7f45ebb6a700 (LWP 4747))]

(gdb) info line main.c:39
Line 39 of "main.c" starts at address 0x4008ba <main+168>
   and ends at 0x4008d8 <main+198>.

(gdb) list *(0x4008ba)
0x4008ba is in main (main.c:39).
34	    while (1) {  
35	        /* 等待并检测某些资源就绪 */  
36	        /* something */  
37	        /* 解锁告知线程资源就绪 */  
38	        pthread_mutex_unlock(&lock);   
39	        fprintf(stderr, "Main Process unlock mutex\n");    
40	    }  
41	  
42	    return 0;    
43	}


上文指出:
(9)thread 3 是 4747 线程
(10)thread 3 待执行 main.c:39 的 main 的 fprintf

三个线程的当前动作:
thread 3 刚执行完 pthread_mutex_unlock
thread 2 拿着锁睡着了
thread 1 想要 pthread_mutex_lock 加锁,但是断言失败(mutex->__data.__owner == 0)

//====================================================================================

分析

问题(1)
thread 1 要加的锁跟 thread 2 拿着的锁是否是同一把锁?
(理想代码肯定看代码就知道,但是实际项目中,经常是好几把锁)

从栈帧入手

thread 1 
#0  0x00007f45eb3af438 in __GI_raise (sig=sig@entry=6)
    at ../sysdeps/unix/sysv/linux/raise.c:54
#1  0x00007f45eb3b103a in __GI_abort () at abort.c:89
#2  0x00007f45eb3a7be7 in __assert_fail_base (fmt=<optimized out>, 
    assertion=assertion@entry=0x7f45eb757015 "mutex->__data.__owner == 0", 
    file=file@entry=0x7f45eb756ff8 "../nptl/pthread_mutex_lock.c", line=line@entry=81, 
    function=function@entry=0x7f45eb757180 <__PRETTY_FUNCTION__.8623> "__pthread_mutex_lock") at assert.c:92
#3  0x00007f45eb3a7c92 in __GI___assert_fail (
    assertion=assertion@entry=0x7f45eb757015 "mutex->__data.__owner == 0", 
    file=file@entry=0x7f45eb756ff8 "../nptl/pthread_mutex_lock.c", line=line@entry=81, 
    function=function@entry=0x7f45eb757180 <__PRETTY_FUNCTION__.8623> "__pthread_mutex_lock") at assert.c:101
#4  0x00007f45eb74df68 in __GI___pthread_mutex_lock (mutex=<optimized out>)
    at ../nptl/pthread_mutex_lock.c:81
#5  0x00000000004007e9 in process (arg=0x4009a6) at main.c:13
#6  0x00007f45eb74b6ba in start_thread (arg=0x7f45eab78700) at pthread_create.c:333
#7  0x00007f45eb4814dd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

函数代码

void * process(void * arg)    /* Thread entry point; arg is the thread's name string, printed with %s. */
{    
    fprintf(stderr, "Starting process %s\n", (char *) arg);    
    /* NOTE: this thread only ever locks `lock`; the unlock calls are made by main(). */
    while (1) {  
        /* Lock in order to wait for some resource */  
        pthread_mutex_lock(&lock);  
        fprintf(stderr, "Process %s lock mutex\n", (char *) arg);    
        /* A successful lock means the resource is ready */  
        usleep(1000);  
        /* do something */  
    }  
    /* Not reached: the loop above has no exit. */
    return NULL;    
} 

thread 1 的 frame 5 是 process 上下文
thread 1 的 frame 4 是 process 调用的 pthread_mutex_lock() 上下文,即 __GI___pthread_mutex_lock 上下文

pthread_mutex_lock() 只有一个形参 mutex,这里传入的实参是 &lock

进入 pthread_mutex_lock() 之前,一定会将 &lock 放入寄存器,
进入 __GI___pthread_mutex_lock() 之后,一定会用到 &lock ,所以它的值肯定也会保存在栈上

顺着这个思路:寻找 &lock 的值

打印 start_thread(frame 6)的汇编

(gdb) frame 6
#6  0x00007f45eb74b6ba in start_thread (arg=0x7f45eab78700) at pthread_create.c:333
333	pthread_create.c: No such file or directory.
(gdb) disas
... ...
End of assembler dump.

打印 process 的汇编

(gdb) frame 5
#5  0x00000000004007e9 in process (arg=0x4009a6) at main.c:13
13	        pthread_mutex_lock(&lock);  
(gdb) disas
Dump of assembler code for function process:
   0x00000000004007b6 <+0>:	push   %rbp
   0x00000000004007b7 <+1>:	mov    %rsp,%rbp
   0x00000000004007ba <+4>:	sub    $0x10,%rsp
   0x00000000004007be <+8>:	mov    %rdi,-0x8(%rbp)
   0x00000000004007c2 <+12>:	mov    0x200897(%rip),%rax        # 0x601060 <stderr@@GLIBC_2.2.5>
   0x00000000004007c9 <+19>:	mov    -0x8(%rbp),%rdx
   0x00000000004007cd <+23>:	mov    $0x400964,%esi
   0x00000000004007d2 <+28>:	mov    %rax,%rdi
   0x00000000004007d5 <+31>:	mov    $0x0,%eax
   0x00000000004007da <+36>:	callq  0x400660 <fprintf@plt>
   0x00000000004007df <+41>:	mov    $0x601080,%edi   // 【把 0x601080 赋值给 edi 寄存器】
   0x00000000004007e4 <+46>:	callq  0x400690 <pthread_mutex_lock@plt>
=> 0x00000000004007e9 <+51>:	mov    0x200870(%rip),%rax        # 0x601060 <stderr@@GLIBC_2.2.5>
   0x00000000004007f0 <+58>:	mov    -0x8(%rbp),%rdx
   0x00000000004007f4 <+62>:	mov    $0x400979,%esi
   0x00000000004007f9 <+67>:	mov    %rax,%rdi
   0x00000000004007fc <+70>:	mov    $0x0,%eax
   0x0000000000400801 <+75>:	callq  0x400660 <fprintf@plt>
   0x0000000000400806 <+80>:	mov    $0x3e8,%edi
   0x000000000040080b <+85>:	callq  0x4006a0 <usleep@plt>
   0x0000000000400810 <+90>:	jmp    0x4007df <process+41>
End of assembler dump.

由上面得出, 0x601080 应该就是 lock 的地址了
用查看内存的方式看下:

(gdb) x 0x601080
0x601080 <lock>:	""

(提前剧透一下,还真是,但这不是分析出来的)

(gdb) p &lock
$8 = (pthread_mutex_t *) 0x601080 <lock>

用 info args 的形式来捞下 pthread_mutex_lock 的实参

(gdb) frame 4
#4  0x00007f45eb74df68 in __GI___pthread_mutex_lock (mutex=<optimized out>)
    at ../nptl/pthread_mutex_lock.c:81
81	../nptl/pthread_mutex_lock.c: No such file or directory.
(gdb) info args
mutex = <optimized out>

捞不到

(gdb) info reg
寄存器名称     16进制值  10进制值  
rax            0x0	0
rbx            0x0	0
rcx            0x7f45eb3af438	139938275980344
rdx            0x6	6
rsi            0x128d	4749
rdi            0x128b	4747
rbp            0x7f45eab77f50	0x7f45eab77f50
rsp            0x7f45eab77f40	0x7f45eab77f40
r8             0x7f45e4000a70	139938154678896
r9             0xfefefeff092d6300	-72340172667264256
r10            0x8	8
r11            0x206	518
r12            0x0	0
r13            0x7fff058268ff	140733285820671
r14            0x7f45eab789c0	139938267367872
r15            0x0	0
rip            0x4007e9	0x4007e9 <process+51> // 【待执行的下一条语句】
eflags         0x206	[ PF IF ]
cs             0x33	51
ss             0x2b	43
ds             0x0	0
es             0x0	0
fs             0x0	0
gs             0x0	0

看下 thread 2 的 lock

thread 2
#0  0x00007f45eb44638d in nanosleep () at ../sysdeps/unix/syscall-template.S:84
#1  0x00007f45eb477e54 in usleep (useconds=<optimized out>) at ../sysdeps/posix/usleep.c:32
#2  0x0000000000400810 in process (arg=0x400990) at main.c:16
#3  0x00007f45eb74b6ba in start_thread (arg=0x7f45eb379700) at pthread_create.c:333
#4  0x00007f45eb4814dd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

同样的捞法,是同一把锁

//====================================================================================
查看 lock 出事时的内部信息,发现 __owner 和 __count 都清零了

(gdb) p lock
$7 = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 4294963709, 
    __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, 
  __size = '\000' <repeats 12 times>, "\375\361\377\377", '\000' <repeats 23 times>, 
  __align = 0}

但是 thread 1 在加锁时,却爆出来说: __owner != 0

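这个解释还在思考中… …

一个可能的解释:core 文件保存的是进程被 abort 终止那一刻的内存,而断言检查发生在更早的时刻。
在 thread 1 断言失败到进程真正终止之间,主线程(thread 3)仍在循环调用 pthread_mutex_unlock,
而解锁路径会无条件执行 mutex->__data.__owner = 0(见下文解锁函数),所以事后看到的 __owner 已经是 0。
另外 __nusers = 4294963709 是无符号数下溢的结果(相当于 -3587),说明解锁次数远多于加锁次数,与主线程不停解锁的行为吻合。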

//====================================================================================

参考博客

https://blog.csdn.net/yxtxiaotian/article/details/78609504#comments_16952964

/*
 * Abridged excerpt of __pthread_mutex_lock as quoted from the referenced blog
 * (the backtrace above places it in ../nptl/pthread_mutex_lock.c).
 * The "..." line marks code that the quote omits, so this is not compilable as-is.
 * The numbered trailing comments mark the four steps discussed in the text below.
 */
int  __pthread_mutex_lock (mutex) {  
  assert (sizeof (mutex->__size) >= sizeof (mutex->__data));  
  
  unsigned int type = PTHREAD_MUTEX_TYPE (mutex);  
  if (__builtin_expect (type & ~PTHREAD_MUTEX_KIND_MASK_NP, 0))  
    return __pthread_mutex_lock_full (mutex);  
  
  pid_t id = THREAD_GETMEM (THREAD_SELF, tid);  
  
  if (__builtin_expect (type, PTHREAD_MUTEX_TIMED_NP)  
      == PTHREAD_MUTEX_TIMED_NP)                                //1---check the mutex type  
    {  
    simple:  
      /* Normal mutex.  */  
      LLL_MUTEX_LOCK (mutex);                                   //2---acquire the lock (atomic operation)  
      assert (mutex->__data.__owner == 0);                      //3---owner check (the assertion that fails in the backtrace)  
    }  
      
  ...  
  
  /* Record the ownership.  */  
  mutex->__data.__owner = id;                                   //4---owner assignment  
#ifndef NO_INCR  
  ++mutex->__data.__nusers;  
#endif  
  
  return 0;  
}

加锁函数的主要4步操作:
首先会判断锁的类型,这里仅对PTHREAD_MUTEX_TIMED_NP类型的锁做出分析,该类型的锁为默认的锁类型,
当一个线程加锁后其余请求锁的线程会排入一个等待队列,并在锁解锁后按优先级获得锁。
然后程序调用LLL_MUTEX_LOCK()宏执行底层加锁动作,这个加锁流程是原子的且不同的架构实现并不相同,
然后会判断是否已经有线程获取了该锁(因为PTHREAD_MUTEX_TIMED_NP类型的锁是不允许嵌套加锁的),
若已经有线程获取了锁则出错退出(示例程序中就是在此出错的),
在函数的最后会把当前获得锁的线程号赋给__owner字段(线程与锁绑定)就结束了,
此时当前线程进入临界区,其他对锁请求的线程将阻塞。
[原文此处为配图,图片缺失]

/*
 * Abridged excerpt of __pthread_mutex_unlock_usercnt as quoted from the
 * referenced blog. The "..." line marks code that the quote omits, so this is
 * not compilable as-is. The numbered trailing comments mark the three steps
 * discussed in the text below.
 * Note that the owner field is reset (step 2) before the low-level unlock
 * (step 3), and that nothing in the shown path checks who the current owner is.
 */
int __pthread_mutex_unlock_usercnt (mutex, decr) {  
  int type = PTHREAD_MUTEX_TYPE (mutex);  
  if (__builtin_expect (type & ~PTHREAD_MUTEX_KIND_MASK_NP, 0))  
    return __pthread_mutex_unlock_full (mutex, decr);  
  
  if (__builtin_expect (type, PTHREAD_MUTEX_TIMED_NP)                //1---check the mutex type  
      == PTHREAD_MUTEX_TIMED_NP)  
    {  
      /* Always reset the owner field.  */  
    normal:  
      mutex->__data.__owner = 0;                                        //2---clear the owner  
      if (decr)  
    /* One less user.  */  
    --mutex->__data.__nusers;  
  
      /* Unlock.  */  
      lll_unlock (mutex->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex)); //3---atomic unlock  
      return 0;  
    }  
      
    ...  
}

解锁函数的3步:
首先依旧是判断锁类型,然后解除锁和线程的绑定关系,
最后就调用lll_unlock()函数原子的解锁,此时若有加锁线程需要获取锁,
相应线程会从LLL_MUTEX_LOCK()返回继续执行。

这两个函数的执行并不是原子的,是可能存在上下文切换动作的。
在通常的用法中,加锁操作一般都是为了保护临界资源不被重入改写,
一般都是严格按照“加锁–>写入/读取临界资源–>解锁”的流程执行(由加锁的线程负责解锁)。
[原文此处为配图,图片缺失]
加锁:
(1) LLL_MUTEX_LOCK
(2) assert (mutex->__data.__owner == 0)
(3) 若为0就执行 mutex->__data.__owner = id;
(4) 若不为0就断言失败并 abort(下文简称 EE)

解锁
(1) mutex->__data.__owner = 0
(2) lll_unlock

如下三个场景,绝对造成 EE
[原文此处为三张场景示意图,图片缺失]
其实不论上述哪一种同步的情况,其出错的原因有两点:
(1)解了未被上锁的锁;
(2)A线程加的锁由其他线程去解,进一步分析就是没有严格按照“加锁–>解锁”的流程使用mutex锁。

最后对于以上这种“线程间同步”的使用方法可以使用条件变量或者是信号量实现而不要使用mutex锁,mutex锁一般被用在保护线程间临界资源的情况下。

总结:
1、不要去解锁一个未被加锁的mutex锁;
2、不要一个线程中加锁而在另一个线程中解锁;
3、使用mutex锁用于保护临界资源,严格按照“加锁–>写入/读取临界资源–>解锁”的流程执行,对于线程间同步的需求使用条件变量或信号量实现。

//========================================================================

本人实测

(1) A线程加锁,B线程解锁,不会导致 core dump

#include <stdio.h>    
#include <unistd.h>    
#include "pthread.h"    

pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;      
 
/*
 * Thread entry for the locking side of the experiment.
 *
 * arg: name string of the thread, used only in the log output.
 * Locks the global `lock`, sleeps for 10 seconds and returns NULL.
 * It does not unlock the mutex itself; in this test the unlock is
 * issued by process2 from another thread.
 */
void * process1(void * arg) {
    const char *name = arg;

    fprintf(stderr, "Starting process %s\n", name);

    pthread_mutex_lock(&lock);
    fprintf(stderr, "Process %s lock mutex\n", name);

    sleep(10);

    fprintf(stderr, "Process %s exit\n", name);
    return NULL;
}

/*
 * Thread entry for the unlocking side of the experiment.
 *
 * arg: name string of the thread, used only in the log output.
 * Sleeps for 4 seconds, then calls pthread_mutex_unlock on the global
 * `lock` without ever having locked it in this thread, and returns NULL.
 */
void * process2(void * arg) {
    const char *name = arg;

    fprintf(stderr, "Starting process %s\n", name);

    sleep(4);
    pthread_mutex_unlock(&lock);

    fprintf(stderr, "Process %s exit\n", name);
    return NULL;
}
 
/*
 * Driver for the "lock in thread a, unlock in thread b" experiment.
 *
 * Starts process1 as thread "a" and process2 as thread "b", then waits
 * for both to finish.
 *
 * Returns 0 when both threads were created, 1 if either pthread_create
 * failed.
 */
int main(void) {
    pthread_t th_a, th_b;
    int ret_a, ret_b;

    ret_a = pthread_create(&th_a, NULL, process1, "a");
    if (ret_a != 0) fprintf(stderr, "create a failed %d\n", ret_a);

    ret_b = pthread_create(&th_b, NULL, process2, "b");
    if (ret_b != 0) fprintf(stderr, "create b failed %d\n", ret_b);

    /*
     * Join only threads that were actually created: after a failed
     * pthread_create the pthread_t is not a valid thread ID, and
     * passing it to pthread_join would be undefined behaviour.
     * The threads' return values are not used, so NULL is passed.
     */
    if (ret_a == 0) pthread_join(th_a, NULL);
    if (ret_b == 0) pthread_join(th_b, NULL);

    return (ret_a != 0 || ret_b != 0) ? 1 : 0;
}

(2) 加锁一次,释放锁 N 次,不会导致 core dump

上一篇:libsysutils_module


下一篇:手机号和邮箱的验证