示例
#include <stdio.h>
#include <unistd.h>
#include "pthread.h"
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
/*
 * Worker thread body: announces itself, then loops forever acquiring the
 * global mutex `lock`, logging the acquisition and sleeping for 1000 us.
 *
 * arg: passed to fprintf as a C string (the thread's label).
 * Returns NULL, but the while (1) loop has no exit, so the return statement
 * is never reached.
 *
 * NOTE: this function never calls pthread_mutex_unlock(). In this example the
 * mutex is released only by main(), i.e. the mutex is used as a cross-thread
 * "resource ready" signal rather than to guard a critical section.
 */
void * process(void * arg)
{
fprintf(stderr, "Starting process %s\n", (char *) arg);
while (1) {
/* Lock, waiting for some resource */
pthread_mutex_lock(&lock);
fprintf(stderr, "Process %s lock mutex\n", (char *) arg);
/* A successful lock is taken to mean the resource is ready */
usleep(1000);
/* do something */
}
return NULL;
}
/*
 * Starts two worker threads running process() (labels "a" and "b"), then
 * loops forever unlocking the global mutex `lock` and logging each unlock.
 *
 * A pthread_create() failure is only reported on stderr; execution continues.
 * The while (1) loop has no exit, so `return 0` is never reached.
 *
 * NOTE: main() never locks `lock` itself, so every unlock here releases a
 * mutex that is either not locked or was locked by a worker thread.
 */
int main(void)
{
pthread_t th_a, th_b;
int ret = 0;
ret = pthread_create(&th_a, NULL, process, "a");
if (ret != 0) fprintf(stderr, "create a failed %d\n", ret);
ret = pthread_create(&th_b, NULL, process, "b");
if (ret != 0) fprintf(stderr, "create b failed %d\n", ret);
while (1) {
/* Wait for and detect that some resource is ready */
/* something */
/* Unlock to tell the worker threads that the resource is ready */
pthread_mutex_unlock(&lock);
fprintf(stderr, "Main Process unlock mutex\n");
}
return 0;
}
//====================================================================================
开启 gdb
root@ubuntu:/corefile# gdb ./test2/main core-main-4747-1624774848
Reading symbols from ./test2/main...done.
[New LWP 4749]
[New LWP 4748]
[New LWP 4747]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
Core was generated by `./main'.
Program terminated with signal SIGABRT, Aborted.
#0 0x00007f45eb3af438 in __GI_raise (sig=sig@entry=6)
at ../sysdeps/unix/sysv/linux/raise.c:54
54 ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
[Current thread is 1 (Thread 0x7f45eab78700 (LWP 4749))]
上文指出
(1)三个线程:4747 、 4748 、 4749(对应 gdb 的 thread 1)
(2)core dump 爆在 4749
(gdb) bt
#0 0x00007f45eb3af438 in __GI_raise (sig=sig@entry=6)
at ../sysdeps/unix/sysv/linux/raise.c:54
#1 0x00007f45eb3b103a in __GI_abort () at abort.c:89
#2 0x00007f45eb3a7be7 in __assert_fail_base (fmt=<optimized out>,
assertion=assertion@entry=0x7f45eb757015 "mutex->__data.__owner == 0",
file=file@entry=0x7f45eb756ff8 "../nptl/pthread_mutex_lock.c", line=line@entry=81,
function=function@entry=0x7f45eb757180 <__PRETTY_FUNCTION__.8623> "__pthread_mutex_lock") at assert.c:92
#3 0x00007f45eb3a7c92 in __GI___assert_fail (
assertion=assertion@entry=0x7f45eb757015 "mutex->__data.__owner == 0",
file=file@entry=0x7f45eb756ff8 "../nptl/pthread_mutex_lock.c", line=line@entry=81,
function=function@entry=0x7f45eb757180 <__PRETTY_FUNCTION__.8623> "__pthread_mutex_lock") at assert.c:101
#4 0x00007f45eb74df68 in __GI___pthread_mutex_lock (mutex=<optimized out>)
at ../nptl/pthread_mutex_lock.c:81
#5 0x00000000004007e9 in process (arg=0x4009a6) at main.c:13
#6 0x00007f45eb74b6ba in start_thread (arg=0x7f45eab78700) at pthread_create.c:333
#7 0x00007f45eb4814dd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
上文指出
(4)thread 1 的卡死位置在 main.c:13
(5)卡死原因: 断言失败,assertion@entry=0x7f45eb757015 "mutex->__data.__owner == 0"
(gdb) info line main.c:13
Line 13 of "main.c" starts at address 0x4007df <process+41>
and ends at 0x4007e9 <process+51>.
(gdb) list *(0x4007df)
0x4007df is in process (main.c:13).
8 {
9 fprintf(stderr, "Starting process %s\n", (char *) arg);
10
11 while (1) {
12 /* 加锁等待某些资源 */
13 pthread_mutex_lock(&lock);
14 fprintf(stderr, "Process %s lock mutex\n", (char *) arg);
15 /* 加锁成功表示资源就绪 */
16 usleep(1000);
17 /* do something */
或者用栈帧的方式看
(gdb) frame 5
#5 0x00000000004007e9 in process (arg=0x4009a6) at main.c:13
13 pthread_mutex_lock(&lock);
上文指出
(6)thread 1 的具体卡死位置是 process 句柄中的 pthread_mutex_lock(&lock);
看下其他几个线程的位置:
(gdb) info threads
Id Target Id Frame
* 1 Thread 0x7f45eab78700 (LWP 4749) 0x00007f45eb3af438 in __GI_raise (sig=sig@entry=6)
at ../sysdeps/unix/sysv/linux/raise.c:54
2 Thread 0x7f45eb379700 (LWP 4748) 0x00007f45eb44638d in nanosleep ()
at ../sysdeps/unix/syscall-template.S:84
3 Thread 0x7f45ebb6a700 (LWP 4747) __lll_unlock_wake_private ()
at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:341
(gdb) thread 2
[Switching to thread 2 (Thread 0x7f45eb379700 (LWP 4748))]
#0 0x00007f45eb44638d in nanosleep () at ../sysdeps/unix/syscall-template.S:84
84 ../sysdeps/unix/syscall-template.S: No such file or directory.
(gdb) bt
#0 0x00007f45eb44638d in nanosleep () at ../sysdeps/unix/syscall-template.S:84
#1 0x00007f45eb477e54 in usleep (useconds=<optimized out>) at ../sysdeps/posix/usleep.c:32
#2 0x0000000000400810 in process (arg=0x400990) at main.c:16
#3 0x00007f45eb74b6ba in start_thread (arg=0x7f45eb379700) at pthread_create.c:333
#4 0x00007f45eb4814dd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
(gdb) thread
[Current thread is 2 (Thread 0x7f45eb379700 (LWP 4748))]
(gdb) info main.c:16
Undefined info command: "main.c:16". Try "help info".
(gdb) info line main.c:16
Line 16 of "main.c" starts at address 0x400806 <process+80>
and ends at 0x400810 <process+90>.
(gdb) list *(0x400810)
0x400810 is in process (main.c:18).
13 pthread_mutex_lock(&lock);
14 fprintf(stderr, "Process %s lock mutex\n", (char *) arg);
15 /* 加锁成功表示资源就绪 */
16 usleep(1000);
17 /* do something */
18 }
19
20 return NULL;
21 }
22
上文指出:
(7)thread 2 是 4748 线程
(8)thread 2 卡在 main.c:16 的 process 句柄中的 usleep(1000);
(gdb) thread 3
[Switching to thread 3 (Thread 0x7f45ebb6a700 (LWP 4747))]
#0 __lll_unlock_wake_private () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:341
341 ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S: No such file or directory.
(gdb) bt
#0 __lll_unlock_wake_private () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:341
#1 0x00007f45eb3e8840 in _IO_acquire_lock_fct (p=<synthetic pointer>) at libioP.h:867
#2 __GI__IO_fwrite (buf=0x4009bc, size=1, count=26, fp=0x7f45eb73f540 <_IO_2_1_stderr_>)
at iofwrite.c:37
#3 0x00000000004008d8 in main () at main.c:39
(gdb) thread
[Current thread is 3 (Thread 0x7f45ebb6a700 (LWP 4747))]
(gdb) info line main.c:39
Line 39 of "main.c" starts at address 0x4008ba <main+168>
and ends at 0x4008d8 <main+198>.
(gdb) list *(0x4008ba)
0x4008ba is in main (main.c:39).
34 while (1) {
35 /* 等待并检测某些资源就绪 */
36 /* something */
37 /* 解锁告知线程资源就绪 */
38 pthread_mutex_unlock(&lock);
39 fprintf(stderr, "Main Process unlock mutex\n");
40 }
41
42 return 0;
43 }
上文指出:
(9)thread 3 是 4747 线程
(10)thread 3 正在执行 main.c:39 的 main 的 fprintf(已进入 fwrite,停在 __lll_unlock_wake_private)
三个线程的当前动作:
thread 3 刚执行完 pthread_mutex_unlock
thread 2 拿着锁睡着了
thread 1 想要 pthread_mutex_lock 加锁,但是断言失败(mutex->__data.__owner == 0)
//====================================================================================
分析
问题(1)
thread 1 要加的锁跟 thread 2 拿着的锁是否是同一把锁?
(理想代码肯定看代码就知道,但是实际项目中,经常是好几把锁)
从栈帧入手
thread 1
#0 0x00007f45eb3af438 in __GI_raise (sig=sig@entry=6)
at ../sysdeps/unix/sysv/linux/raise.c:54
#1 0x00007f45eb3b103a in __GI_abort () at abort.c:89
#2 0x00007f45eb3a7be7 in __assert_fail_base (fmt=<optimized out>,
assertion=assertion@entry=0x7f45eb757015 "mutex->__data.__owner == 0",
file=file@entry=0x7f45eb756ff8 "../nptl/pthread_mutex_lock.c", line=line@entry=81,
function=function@entry=0x7f45eb757180 <__PRETTY_FUNCTION__.8623> "__pthread_mutex_lock") at assert.c:92
#3 0x00007f45eb3a7c92 in __GI___assert_fail (
assertion=assertion@entry=0x7f45eb757015 "mutex->__data.__owner == 0",
file=file@entry=0x7f45eb756ff8 "../nptl/pthread_mutex_lock.c", line=line@entry=81,
function=function@entry=0x7f45eb757180 <__PRETTY_FUNCTION__.8623> "__pthread_mutex_lock") at assert.c:101
#4 0x00007f45eb74df68 in __GI___pthread_mutex_lock (mutex=<optimized out>)
at ../nptl/pthread_mutex_lock.c:81
#5 0x00000000004007e9 in process (arg=0x4009a6) at main.c:13
#6 0x00007f45eb74b6ba in start_thread (arg=0x7f45eab78700) at pthread_create.c:333
#7 0x00007f45eb4814dd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
函数代码
/*
 * Worker thread body (same listing as in the example program): loops forever
 * locking the global mutex `lock`, logging, and sleeping 1000 us. It never
 * unlocks the mutex itself.
 */
void * process(void * arg)
{
fprintf(stderr, "Starting process %s\n", (char *) arg);
while (1) {
/* Lock, waiting for some resource */
pthread_mutex_lock(&lock);
fprintf(stderr, "Process %s lock mutex\n", (char *) arg);
/* A successful lock is taken to mean the resource is ready */
usleep(1000);
/* do something */
}
return NULL;
}
thread 1 的 frame 5 是 process 上下文
thread 1 的 frame 4 是 process 调用的 pthread_mutex_lock() 上下文,即 __GI___pthread_mutex_lock 上下文
pthread_mutex_lock() 的形参只有一个 lock
进入 pthread_mutex_lock() 之前,一定会将 &lock 放入寄存器,
进入 __GI___pthread_mutex_lock() 之后,一定会用到 &lock ,所以它的值肯定也会保存在栈上
顺着这个思路:寻找 &lock 的值
打印 start_thread(frame 6)的汇编
(gdb) frame 6
#6 0x00007f45eb74b6ba in start_thread (arg=0x7f45eab78700) at pthread_create.c:333
333 pthread_create.c: No such file or directory.
(gdb) disas
... ...
End of assembler dump.
打印 process 的汇编
(gdb) frame 5
#5 0x00000000004007e9 in process (arg=0x4009a6) at main.c:13
13 pthread_mutex_lock(&lock);
(gdb) disas
Dump of assembler code for function process:
0x00000000004007b6 <+0>: push %rbp
0x00000000004007b7 <+1>: mov %rsp,%rbp
0x00000000004007ba <+4>: sub $0x10,%rsp
0x00000000004007be <+8>: mov %rdi,-0x8(%rbp)
0x00000000004007c2 <+12>: mov 0x200897(%rip),%rax # 0x601060 <stderr@@GLIBC_2.2.5>
0x00000000004007c9 <+19>: mov -0x8(%rbp),%rdx
0x00000000004007cd <+23>: mov $0x400964,%esi
0x00000000004007d2 <+28>: mov %rax,%rdi
0x00000000004007d5 <+31>: mov $0x0,%eax
0x00000000004007da <+36>: callq 0x400660 <fprintf@plt>
0x00000000004007df <+41>: mov $0x601080,%edi // 【把 0x601080 赋值给 edi 寄存器】
0x00000000004007e4 <+46>: callq 0x400690 <pthread_mutex_lock@plt>
=> 0x00000000004007e9 <+51>: mov 0x200870(%rip),%rax # 0x601060 <stderr@@GLIBC_2.2.5>
0x00000000004007f0 <+58>: mov -0x8(%rbp),%rdx
0x00000000004007f4 <+62>: mov $0x400979,%esi
0x00000000004007f9 <+67>: mov %rax,%rdi
0x00000000004007fc <+70>: mov $0x0,%eax
0x0000000000400801 <+75>: callq 0x400660 <fprintf@plt>
0x0000000000400806 <+80>: mov $0x3e8,%edi
0x000000000040080b <+85>: callq 0x4006a0 <usleep@plt>
0x0000000000400810 <+90>: jmp 0x4007df <process+41>
End of assembler dump.
由上面得出, 0x601080 应该就是 lock 的地址了
用查看内存的方式看下:
(gdb) x 0x601080
0x601080 <lock>: ""
(提前剧透一下,还真是,但这不是分析出来的)
(gdb) p &lock
$8 = (pthread_mutex_t *) 0x601080 <lock>
用 info args 的形式来捞下 pthread_mutex_lock 的实参
(gdb) frame 4
#4 0x00007f45eb74df68 in __GI___pthread_mutex_lock (mutex=<optimized out>)
at ../nptl/pthread_mutex_lock.c:81
81 ../nptl/pthread_mutex_lock.c: No such file or directory.
(gdb) info args
mutex = <optimized out>
捞不到
(gdb) info reg
寄存器名称 16进制值 10进制值
rax 0x0 0
rbx 0x0 0
rcx 0x7f45eb3af438 139938275980344
rdx 0x6 6
rsi 0x128d 4749
rdi 0x128b 4747
rbp 0x7f45eab77f50 0x7f45eab77f50
rsp 0x7f45eab77f40 0x7f45eab77f40
r8 0x7f45e4000a70 139938154678896
r9 0xfefefeff092d6300 -72340172667264256
r10 0x8 8
r11 0x206 518
r12 0x0 0
r13 0x7fff058268ff 140733285820671
r14 0x7f45eab789c0 139938267367872
r15 0x0 0
rip 0x4007e9 0x4007e9 <process+51> // 【待执行的下一条语句】
eflags 0x206 [ PF IF ]
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0
看下 thread 2 的 lock
thread 2
#0 0x00007f45eb44638d in nanosleep () at ../sysdeps/unix/syscall-template.S:84
#1 0x00007f45eb477e54 in usleep (useconds=<optimized out>) at ../sysdeps/posix/usleep.c:32
#2 0x0000000000400810 in process (arg=0x400990) at main.c:16
#3 0x00007f45eb74b6ba in start_thread (arg=0x7f45eb379700) at pthread_create.c:333
#4 0x00007f45eb4814dd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
同样的捞法,是同一把锁
//====================================================================================
查看 lock 出事时的内部信息,发现 __owner 和 __count 都清零了
(gdb) p lock
$7 = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 4294963709,
__kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = '\000' <repeats 12 times>, "\375\361\377\377", '\000' <repeats 23 times>,
__align = 0}
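但是 thread 1 在加锁时,却爆出来说: __owner != 0
这个解释还在思考中… …
(一种可能的解释:core 文件里的 lock 是进程终止时的快照,而不是断言失败那一瞬间的值。thread 1 断言失败之后、进程被 SIGABRT 终止之前,main 线程仍在循环执行 pthread_mutex_unlock,会再次把 __owner 清零;__nusers = 4294963709(即无符号表示的 -3587)也说明解锁次数远多于加锁次数。)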
//====================================================================================
参考博客
https://blog.csdn.net/yxtxiaotian/article/details/78609504#comments_16952964
/*
 * Abridged excerpt of the mutex lock routine, quoted from the referenced blog
 * (the "..." line marks omitted code). The numbered comments mark the steps
 * discussed in the text: type check, low-level lock, owner assertion, owner
 * assignment.
 */
int __pthread_mutex_lock (mutex) {
assert (sizeof (mutex->__size) >= sizeof (mutex->__data));
unsigned int type = PTHREAD_MUTEX_TYPE (mutex);
if (__builtin_expect (type & ~PTHREAD_MUTEX_KIND_MASK_NP, 0))
return __pthread_mutex_lock_full (mutex);
pid_t id = THREAD_GETMEM (THREAD_SELF, tid);
if (__builtin_expect (type, PTHREAD_MUTEX_TIMED_NP)
== PTHREAD_MUTEX_TIMED_NP) //1---check the mutex type
{
simple:
/* Normal mutex. */
LLL_MUTEX_LOCK (mutex); //2---acquire the lock (atomic operation)
assert (mutex->__data.__owner == 0); //3---owner check
}
...
/* Record the ownership. */
mutex->__data.__owner = id; //4---owner assignment
#ifndef NO_INCR
++mutex->__data.__nusers;
#endif
return 0;
}
加锁函数的主要4步操作:
首先会判断锁的类型,这里仅对PTHREAD_MUTEX_TIMED_NP类型的锁做出分析,该类型的锁为默认的锁类型,
当一个线程加锁后其余请求锁的线程会排入一个等待队列,并在锁解锁后按优先级获得锁。
然后程序调用LLL_MUTEX_LOCK()宏执行底层加锁动作,这个加锁流程是原子的且不同的架构实现并不相同,
然后会判断是否已经有线程获取了该锁(因为PTHREAD_MUTEX_TIMED_NP类型的锁是不允许嵌套加锁的),
若已经有线程获取了锁则出错退出(示例程序中就是在此出错的),
在函数的最后会把当前获得锁的线程号赋给__owner字段(线程与锁绑定)就结束了,
此时当前线程进入临界区,其他对锁请求的线程将阻塞。
/*
 * Abridged excerpt of the mutex unlock routine, quoted from the referenced
 * blog (the "..." line marks omitted code). The numbered comments mark the
 * steps discussed in the text: type check, clearing the owner, low-level
 * unlock.
 */
int __pthread_mutex_unlock_usercnt (mutex, decr) {
int type = PTHREAD_MUTEX_TYPE (mutex);
if (__builtin_expect (type & ~PTHREAD_MUTEX_KIND_MASK_NP, 0))
return __pthread_mutex_unlock_full (mutex, decr);
if (__builtin_expect (type, PTHREAD_MUTEX_TIMED_NP) //1---check the mutex type
== PTHREAD_MUTEX_TIMED_NP)
{
/* Always reset the owner field. */
normal:
mutex->__data.__owner = 0; //2---clear the owner
if (decr)
/* One less user. */
--mutex->__data.__nusers;
/* Unlock. */
lll_unlock (mutex->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex)); //3---atomic unlock
return 0;
}
...
}
解锁函数的3步:
首先依旧是判断锁类型,然后解除锁和线程的绑定关系,
最后就调用lll_unlock()函数原子的解锁,此时若有加锁线程需要获取锁,
相应线程会从LLL_MUTEX_LOCK()函数返回继续执行。
这两个函数的执行并不是原子的,是可能存在上下文切换动作的。
在通常的用法中,加锁操作一般都是为了保护临界资源不被重入改写,
一般都是严格按照“加锁–>写入/读取临界资源–>解锁”的流程执行(由加锁的线程负责解锁),
加锁:
(1) LLL_MUTEX_LOCK
(2) assert (mutex->__data.__owner == 0)
(3) 若为0就执行 mutex->__data.__owner = id;
(4) 若不为0就 EE
解锁
(1) mutex->__data.__owner = 0
(2) lll_unlock
如下三个场景,绝对造成 EE
其实不论上述哪一种同步的情况,其出错的原因有两点:
(1)解了未被上锁的锁;
(2)A线程加的锁由其他线程去解,进一步分析就是没有严格按照“加锁–>解锁”的流程使用mutex锁。
最后对于以上这种“线程间同步”的使用方法可以使用条件变量或者是信号量实现而不要使用mutex锁,mutex锁一般被用在保护线程间临界资源的情况下。
总结:
1、不要去解锁一个未被加锁的mutex锁;
2、不要一个线程中加锁而在另一个线程中解锁;
3、使用mutex锁用于保护临界资源,严格按照“加锁–>写入/读取临界资源–>解锁”的流程执行,对于线程间同步的需求使用条件变量或信号量实现。
//========================================================================
本人实测
(1) A线程加锁,B线程解锁,不会导致 core dump
#include <stdio.h>
#include <unistd.h>
#include "pthread.h"
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
/*
 * "Locker" side of the experiment: logs its start, locks the global mutex
 * `lock`, logs the acquisition, sleeps 10 seconds, logs its exit and returns
 * NULL. It never unlocks the mutex itself.
 *
 * arg: the thread's label, printed as a C string.
 */
void * process1(void * arg) {
    const char *label = arg;

    fprintf(stderr, "Starting process %s\n", label);

    pthread_mutex_lock(&lock);
    fprintf(stderr, "Process %s lock mutex\n", label);

    sleep(10);

    fprintf(stderr, "Process %s exit\n", label);
    return NULL;
}
/*
 * "Unlocker" side of the experiment: logs its start, waits 4 seconds, then
 * unlocks the global mutex `lock` without having locked it in this thread,
 * logs its exit and returns NULL.
 *
 * arg: the thread's label, printed as a C string.
 */
void * process2(void * arg) {
    const char *label = arg;

    fprintf(stderr, "Starting process %s\n", label);

    sleep(4);
    pthread_mutex_unlock(&lock);

    fprintf(stderr, "Process %s exit\n", label);
    return NULL;
}
/*
 * Experiment driver: starts thread "a" (process1, which locks the mutex) and
 * thread "b" (process2, which unlocks it), then waits for both to finish.
 *
 * Returns 0 when both threads were created and joined, 1 if either
 * pthread_create() call fails. A thread that was never created is never
 * joined, because its pthread_t would be uninitialized.
 */
int main(void) {
    pthread_t th_a, th_b;
    int ret;

    ret = pthread_create(&th_a, NULL, process1, "a");
    if (ret != 0) {
        fprintf(stderr, "create a failed %d\n", ret);
        return 1;
    }

    ret = pthread_create(&th_b, NULL, process2, "b");
    if (ret != 0) {
        fprintf(stderr, "create b failed %d\n", ret);
        /* Thread "a" is already running; wait for it before exiting. */
        pthread_join(th_a, NULL);
        return 1;
    }

    /* The thread return values are not used, so no result pointer is passed. */
    pthread_join(th_a, NULL);
    pthread_join(th_b, NULL);
    return 0;
}
(2) 加锁一次,释放锁 N 次,不会导致 core dump