pthread_cancel 退出线程引起死锁的问题和解决方法

2024-01-29 18:52:16

Posix的线程终止有两种情况：正常终止和非正常终止。线程主动调用pthread_exit()或者从线程函数中return都将使线程正常退出，这是可预见的退出方式；非正常终止是线程在其他线程的干预下（比如被pthread_cancel取消），或者由于自身运行出错（比如访问非法地址）而退出，这种退出方式是不可预见的。不论是可预见的线程终止还是异常终止，都会存在资源释放的问题，在不考虑因运行出错而退出的前提下，如何保证线程终止时能顺利地释放掉自己所占用的资源，特别是锁资源，就是一个必须考虑解决的问题。

最经常出现的情形是资源独占锁的使用：线程为了访问临界资源而为其加上锁，但在访问过程中被外界取消，如果线程处于响应取消状态，且采用异步方式响应，或者在打开独占锁以前的运行路径上存在取消点，则该临界资源将永远处于锁定状态得不到释放。外界取消操作是不可预见的，因此的确需要一个机制来简化用于资源释放的编程。

POSIX中以下函数是cancellation点（取消点）:
            pthread_join
            pthread_cond_wait
            pthread_cond_timedwait
            pthread_testcancel
            sem_wait
            sigwait       都是cancellation点.
            下面的这些系统函数也是cancellation点:
             accept
             fcntl
             open
             read
             write
             lseek
             close
             send
            sendmsg
             sendto
            connect
             recv
            recvfrom
            recvmsg
             system
            tcdrain
             fsync
             msync
             pause
             wait
            waitpid
            nanosleep

当其他线程调用pthread_cancel时，本线程会在执行到这些函数（取消点）时响应取消请求并退出线程。

默认测试代码如下：

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/syscall.h>


/* Mutex contended by both worker threads; initialised in main() with pthread_mutex_init(). */
pthread_mutex_t mutexA;
/* Set to 1 by main(); thread_function2 then stays inside its inner wait loop while still holding mutexA. */
int thStop = 0;
/* Mode 2: thread_function2 leaves on its own once is_safe_exit is raised, and main() does not cancel it. */
int is_safemode = 0;
/* Raised by main() in safe mode to tell thread_function2 to break out of its loops. */
int is_safe_exit = 0;
/* Mode 3: thread_function2 disables cancellation while it holds the lock (identifier keeps the original "cancle" spelling). */
int is_setcancle = 0;
/* Mode 1: allows the cleanup handler clean_function2_res() to release mutexA; otherwise the handler returns immediately. */
int is_notify = 0;

/*
 * Worker 1: endlessly acquires mutexA, prints the mutex bookkeeping fields,
 * keeps the lock for one second, releases it and then rests for one second.
 *
 * arg is not used. The loop has no exit, so the function never returns.
 */
void *thread_function1(void *arg)
{
  long int processId = getpid();
  long int tid = syscall(SYS_gettid);
  pthread_t self = (pthread_t)(pthread_self());

  printf("thread[0x%lx][%ld][%ld] in function1\n",self,tid,processId);

  for(;;)
  {
    /* Report who currently owns the mutex before blocking on it. */
    printf("function1 owner:%ld waiting lock owner:%d ...\n",tid,mutexA.__data.__owner);
    pthread_mutex_lock(&mutexA);

    /* Critical section: dump the glibc-internal mutex state and hold the lock for a while. */
    printf("function1 mutex:owner::%d;count::%d;lock:%d\n",
             mutexA.__data.__owner,mutexA.__data.__count,mutexA.__data.__lock);
    printf("I an thread[0x%lx][%ld] function1\n",self,tid);
    sleep(1);

    pthread_mutex_unlock(&mutexA);

    /* Give the other worker a chance to take the lock. */
    sleep(1);
  }
}

/*
 * Cancellation cleanup handler registered by thread_function2.
 *
 * arg points to an int holding the kernel thread id of the registering
 * thread. When is_notify is set and mutexA is currently owned by that
 * thread id, the mutex is released; in every other case nothing is unlocked.
 */
void clean_function2_res(void *arg)
{
  int ownerTid = *(int *)arg;

  if(is_notify)
  {
    printf("clean function2 res lwpid:%d\n",ownerTid);

    /* Only unlock when the dying thread is really the one holding the mutex. */
    if(mutexA.__data.__owner == ownerTid)
    {
      pthread_mutex_unlock(&mutexA);
      printf("clean function2 res lock\n");
    }
  }
}

/*
 * Worker 2: the thread that main() cancels (or asks to exit) in the demo.
 *
 * While thStop is 0 it behaves like worker 1: lock mutexA, print, hold the
 * lock for one second, unlock, rest for one second.
 *
 * Once thStop is set it stays in an inner loop WITHOUT releasing mutexA:
 *   - default / mode 1: it just polls; a cancel delivered here kills the
 *     thread while it owns the lock (the cleanup handler may release it,
 *     depending on is_notify).
 *   - mode 2 (is_safemode): it leaves the loop when is_safe_exit is raised,
 *     unlocks normally and terminates through pthread_exit().
 *   - mode 3 (is_setcancle): it disables cancellation, and after more than
 *     10 polls releases the lock, re-enables cancellation and calls
 *     pthread_testcancel() so a pending cancel is honoured only after the
 *     mutex has been released.
 *
 * arg is not used. Returns NULL if the end of the function is ever reached.
 */
void *thread_function2(void *arg)
{
  int oldstate = 0;
  int waitCount = 0;
  pthread_t threadId = 0;
  long int pid = getpid();
  int lwpId = (int)syscall(SYS_gettid);

  (void)arg;
  threadId  = pthread_self();
  printf("thread[0x%lx][%d][%ld] in function2\n",(unsigned long)threadId,lwpId,pid);

  /* The handler receives the address of lwpId, which lives until this function ends. */
  pthread_cleanup_push(clean_function2_res,(void *)&lwpId);

  while(1)
  {
    printf("function2 owner:%d waiting lock owner:%d ...\n",lwpId,mutexA.__data.__owner);
    pthread_mutex_lock(&mutexA);
    printf("function2 mutex:owner::%d;count::%d;lock:%d\n",
             mutexA.__data.__owner,mutexA.__data.__count,mutexA.__data.__lock);
    if(thStop)
    {
      /* From here on the mutex stays locked until one of the exits below is taken. */
      while(1)
      {
        if((is_safemode) && (is_safe_exit))
        {
          break;
        }
        /* Thread id is printed in hex to match the "0x" prefix and the other log lines. */
        printf("waiting thread[0x%lx] cancel...\n",(unsigned long)threadId);
        usleep(500000);
        if(is_setcancle)
        {
         waitCount ++;
         /* Repeated on every poll; after the first call oldstate reports DISABLE. */
         pthread_setcancelstate(PTHREAD_CANCEL_DISABLE,&oldstate);
         printf("pthread cancel oldstatue:%d;[%d]:[%d]\n",oldstate,PTHREAD_CANCEL_DISABLE,PTHREAD_CANCEL_ENABLE);
         if(waitCount > 10)
         {
          printf("it will into cancel pthread point\n");
          /* Release the shared resource first, while cancellation is still disabled. */
          pthread_mutex_unlock(&mutexA);
          sleep(1);
          pthread_setcancelstate(PTHREAD_CANCEL_ENABLE,NULL);
          printf("waiting cancel testcancel point\n");
          /* Explicit cancellation point: a pending cancel request takes effect here. */
          pthread_testcancel();
          printf("test cancel point\n");
          /* Only reached when no cancel was pending; keep waiting without the lock. */
          while(1)
          {
            printf("waiting cancel pthread...\n");
            usleep(500000);
          }
         }
        }
      }
    }
    else
    {
     printf("I an thread[0x%lx][%d] function2\n",(unsigned long)threadId,lwpId);
     sleep(1);
    }
    pthread_mutex_unlock(&mutexA);
    sleep(1);
    if((is_safemode) && (is_safe_exit))
    {
     break;
    }
  }


  if(is_safemode)
  {
   printf("exit pthread by safe mode\n");
   /* pthread_exit() runs the pushed cleanup handler before the thread ends. */
   pthread_exit(NULL);
  }

  pthread_cleanup_pop(0);

  /* Explicit result so pthread_join() never reads an indeterminate value. */
  return NULL;
}

/*
 * Demo driver.
 *
 * Usage: mylock [mode]
 *   0 (default) reproduce the deadlock: worker 2 is cancelled while holding mutexA
 *   1           cleanup handler releases the mutex on cancellation
 *   2           "safe exit": worker 2 is asked to stop via a flag, no pthread_cancel
 *   3           worker 2 disables cancellation while it holds the mutex
 *
 * Starts both workers, lets them alternate for 10 seconds, sets thStop,
 * waits 3 seconds, then either raises is_safe_exit (mode 2) or cancels
 * worker 2, joins it, and finally loops forever printing a heartbeat so the
 * state of worker 1 can be observed.
 *
 * Returns EXIT_FAILURE if the mutex or a thread cannot be created; otherwise
 * it does not return.
 */
int main(int avgc,char **pp_argv)
{
  pthread_t worker1;
  pthread_t worker2;
  unsigned int count = 0;
  /* pthread_join() stores a void * here; an int would be too small on LP64. */
  void *ret = NULL;
  int mode = 0;
  int rc = 0;

  if(avgc >= 2)
   {
    mode = atoi(pp_argv[1]);
   }

   switch(mode)
   {
     case 1:
     is_notify = 1;
     break;
     case 2:
     is_safemode = 1;
     break;
     case 3:
     is_setcancle = 1;
     break;
     case 0:
     default:
     break;
   }

  printf("notify clean mode:%d\n",is_notify);
  printf("safe mode:%d\n",is_safemode);
  printf("set cancle mode:%d\n",is_setcancle);


  is_safe_exit = 0;
  thStop = 0;
  rc = pthread_mutex_init(&mutexA, NULL);
  if(rc != 0)
  {
    fprintf(stderr,"pthread_mutex_init failed: %s\n",strerror(rc));
    return EXIT_FAILURE;
  }

  rc = pthread_create(&worker1,NULL,thread_function1,NULL);
  if(rc != 0)
  {
    fprintf(stderr,"pthread_create(thread_function1) failed: %s\n",strerror(rc));
    return EXIT_FAILURE;
  }
  printf("create thread:0x%lx\n",(unsigned long)worker1);

  rc = pthread_create(&worker2,NULL,thread_function2,NULL);
  if(rc != 0)
  {
    fprintf(stderr,"pthread_create(thread_function2) failed: %s\n",strerror(rc));
    return EXIT_FAILURE;
  }
  printf("create thread:0x%lx\n",(unsigned long)worker2);

  /* Let both workers take turns on the mutex for 10 seconds. */
  do{
    sleep(1);
    count ++;
    printf("main thread count:%u...\n",count);
   }while(count < 10);

  /* Make worker 2 keep the mutex, and give it time to reach its wait loop. */
  thStop = 1;
  sleep(3);

  if(is_safemode)
  {
    is_safe_exit = 1;
  }
  else
  {
    /* Only worker 2 is cancelled; worker 1 keeps running (or blocks on the mutex). */
    rc = pthread_cancel(worker2);
    if(rc != 0)
    {
      fprintf(stderr,"pthread_cancel failed: %s\n",strerror(rc));
    }
  }

  rc = pthread_join(worker2,&ret);
  if(rc != 0)
  {
    fprintf(stderr,"pthread_join failed: %s\n",strerror(rc));
  }

  /* Heartbeat: shows the process is alive while worker 1 either runs or is deadlocked. */
  while(1)
  {
   printf("main thread function...\n");
   sleep(1);
  }

  /* Not reached because of the loop above; kept to mirror the init call. */
  pthread_mutex_destroy(&mutexA);

  return 0;
}

编译：gcc -g mylock.c -lpthread -o mylock

复现问题：./mylock 0 强制进入死锁环境；

主线程设置thStop = 1;让thread_function2进入持锁（lock）状态，然后调用pthread_cancel(mthid);终止线程thread_function2；thread_function2退出时没有释放互斥锁，导致thread_function1无法获取互斥锁，从而死锁停止运行；

解决方案1,注册线程清理回调

void pthread_cleanup_push(void (*routine) (void *), void *arg)
void pthread_cleanup_pop(int execute)

pthread_cleanup_push()/pthread_cleanup_pop()采用先入后出的栈结构管理，void routine(void *arg)函数在调用pthread_cleanup_push()时压入清理函数栈，多次对pthread_cleanup_push() 的调用将在清理函数栈中形成一个函数链；从pthread_cleanup_push的调用点到pthread_cleanup_pop之间的程序段中的终止动作（包括调用pthread_exit()、pthread_cancel和异常终止，不包括return）都将执行pthread_cleanup_push()所指定的清理函数。

运行结果参考 ./mylock 1

解决方案2，线程安全退出，外部线程不要采用pthread_cancel结束线程，而是采用通知方法，由本线程接受到消息或参数后释放资源安全退出，

运行结果参考 ./mylock 2

解决方案3，在访问公共资源（持有锁）期间屏蔽线程对pthread_cancel的响应。

pthread_setcancelstate()用于设置本线程对Cancel信号的反应，state有两种值：PTHREAD_CANCEL_ENABLE（缺省）和 PTHREAD_CANCEL_DISABLE，分别表示收到信号后设为CANCELED状态和忽略CANCEL信号继续运行；old_state如果不为 NULL则存入原来的Cancel状态以便恢复。

pthread_setcancelstate(PTHREAD_CANCEL_DISABLE,&oldstate);

/***free resource安全执行完代码***/

pthread_setcancelstate(PTHREAD_CANCEL_ENABLE,NULL);

设置取消点 pthread_testcancel，

运行结果参考 ./mylock 3

码农公寓

相关文章