在研究此问题时,我遇到了使用ptrace的可能想法,但是我无法正确理解ptrace与线程的交互方式。
假设我有一个给定的多线程主进程,并且我想附加到其中的特定线程(可能来自派生的子进程)。
-
我可以附加到特定线程吗? (有关此问题的手册有所不同。)
-
如果是这样,是否意味着单步执行仅一步步执行该线程的指令?它会停止所有进程的线程吗?
-
如果是这样,我调用PTRACE_SYSCALL或PTRACE_SINGLESTEP时所有其他线程是否仍保持停止状态,还是所有线程都继续?有没有一种方法可以只在一个线程中前进,但要确保其他线程保持停止状态?
基本上,我想通过强制所有线程停止来同步原始程序,然后仅通过单步执行一个被跟踪的线程来执行一小套单线程指令。
到目前为止,我的个人尝试看起来像这样:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
/* Sketch from the question: a forked child tries to trace one thread of
 * its parent so that important_instruction() runs while the parent is
 * under ptrace control.
 *
 * Note: ptrace() takes the request first and the thread ID second,
 * i.e. ptrace(request, pid, addr, data).
 */
pid_t target = syscall(SYS_gettid);   /* get the calling thread's ID */
pid_t pid = fork();

if (pid > 0) {
    waitpid(pid, NULL, 0);            /* synchronise main process */

    important_instruction();
} else if (pid == 0) {
    ptrace(PTRACE_ATTACH, target, NULL, NULL);   /* does this work? */

    /* cancel parent's "waitpid" call, e.g. with a signal */

    /* single-step to execute "important_instruction()" above */

    ptrace(PTRACE_DETACH, target, NULL, NULL);   /* parent's threads resume? */

    _Exit(0);
}
但是,我不确定这种做法在并发上是否正确,也找不到合适的参考资料来确认它能保证important_instruction()仅在所有其他线程都停止时才执行。我也了解,当父进程从其他地方接收到信号时,可能会出现竞态条件;而且我听说应该改用PTRACE_SEIZE,但它似乎并不存在。
任何澄清或参考将不胜感激!
我写了第二个测试用例。我不得不添加一个单独的答案,因为它太长了,无法放入包含示例输出的第一个答案。
好。
首先,这是tracer.c:
好。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 |
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <sys/user.h>
#include <dirent.h>
#include <string.h>
#include <signal.h>
#include <errno.h>
#include <stdio.h>

/* Default value of the SINGLESTEPS compile-time constant;
 * a definition supplied earlier (e.g. -DSINGLESTEPS=N) takes precedence. */
#ifndef SINGLESTEPS
#define SINGLESTEPS 10
#endif
/* List the task (thread) IDs found under /proc/PID/task/.
 *
 * The buffer is handled getline()-style: *listptr / *sizeptr describe a
 * caller-owned array (used only when *sizeptr > 0) that is grown with
 * realloc() as needed and handed back through the same pointers.
 * The resulting list is terminated with (pid_t)0.
 *
 * Returns the number of TIDs stored (always positive) with errno == 0,
 * or 0 with errno set:
 *   EINVAL   NULL pointer argument or pid < 1
 *   ENOTSUP  the directory name did not fit the local buffer
 *   ESRCH    the task directory could not be opened, or held no TIDs
 *   ENOMEM   the list could not be grown
 *   EIO      closedir() failed
 *   (other)  whatever readdir() reported
 */
size_t get_tids(pid_t **const listptr, size_t *const sizeptr, const pid_t pid)
{
    char path[64];
    DIR *taskdir;
    struct dirent *entry;
    pid_t *tids;
    size_t capacity;
    size_t count = 0;

    if (listptr == NULL || sizeptr == NULL || pid < (pid_t)1) {
        errno = EINVAL;
        return (size_t)0;
    }

    /* Reuse the caller's buffer only if it claims a nonzero size. */
    if (*sizeptr > 0) {
        tids = *listptr;
        capacity = *sizeptr;
    } else {
        *listptr = NULL;
        *sizeptr = 0;
        tids = NULL;
        capacity = 0;
    }

    if (snprintf(path, sizeof path, "/proc/%d/task/", (int)pid) >= (int)sizeof path) {
        errno = ENOTSUP;
        return (size_t)0;
    }

    taskdir = opendir(path);
    if (taskdir == NULL) {
        errno = ESRCH;
        return (size_t)0;
    }

    /* errno is cleared before every readdir() so that a NULL result can be
     * told apart: errno still 0 means end of directory, nonzero an error. */
    for (errno = 0; (entry = readdir(taskdir)) != NULL; errno = 0) {
        int parsed;
        char trailing;

        /* Keep only names that are entirely one positive decimal number. */
        if (sscanf(entry->d_name, "%d%c", &parsed, &trailing) != 1 || parsed < 1)
            continue;

        /* Grow the array when it is full. */
        if (count >= capacity) {
            pid_t *grown;

            capacity = (count | 127) + 128;
            grown = realloc(tids, capacity * sizeof tids[0]);
            if (grown == NULL) {
                closedir(taskdir);
                errno = ENOMEM;
                return (size_t)0;
            }
            tids = grown;
            *listptr = tids;
            *sizeptr = capacity;
        }

        tids[count++] = (pid_t)parsed;
    }

    if (errno != 0) {
        const int readdir_errno = errno;
        closedir(taskdir);
        errno = readdir_errno;
        return (size_t)0;
    }

    if (closedir(taskdir) != 0) {
        errno = EIO;
        return (size_t)0;
    }

    if (count == 0) {
        errno = ESRCH;
        return (size_t)0;
    }

    /* One extra slot is needed for the terminating (pid_t)0. */
    if (count >= capacity) {
        pid_t *grown;

        capacity = count + 1;
        grown = realloc(tids, capacity * sizeof tids[0]);
        if (grown == NULL) {
            errno = ENOMEM;
            return (size_t)0;
        }
        tids = grown;
        *listptr = tids;
        *sizeptr = capacity;
    }

    tids[count] = (pid_t)0;
    errno = 0;
    return count;
}
static int wait_process(const pid_t pid, int *const statusptr) { int status; pid_t p;
do { status = 0; p = waitpid(pid, &status, WUNTRACED | WCONTINUED); } while (p == (pid_t)-1 && errno == EINTR); if (p != pid) return errno = ESRCH;
if (statusptr) *statusptr = status;
return errno = 0; }
static int continue_process(const pid_t pid, int *const statusptr) { int status; pid_t p;
do {
if (kill(pid, SIGCONT) == -1) return errno = ESRCH;
do { status = 0; p = waitpid(pid, &status, WUNTRACED | WCONTINUED); } while (p == (pid_t)-1 && errno == EINTR);
if (p != pid) return errno = ESRCH;
} while (WIFSTOPPED(status));
if (statusptr) *statusptr = status;
return errno = 0; }
/* Print the instruction and stack pointers of traced task `tid` to `out`,
 * followed by `note` when it is a non-empty string.
 *
 * Prints nothing if the registers cannot be read, or when compiled for an
 * architecture other than the x86 variants handled below.
 */
void show_registers(FILE *const out, pid_t tid, const char *const note)
{
    struct user_regs_struct regs;
    long r;

    /* The request is retried for as long as it fails with ESRCH, i.e. until
     * the task has come to a stop after the previous ptrace request.
     * NOTE(review): this spins without limit if the task never reaches
     * that state. */
    do {
        r = ptrace(PTRACE_GETREGS, tid, &regs, &regs);
    } while (r == -1L && errno == ESRCH);
    if (r == -1L)
        return;

    /* The register fields are cast so that they match the %lx conversions. */
#if (defined(__x86_64__) || defined(__i386__)) && __WORDSIZE == 64
    if (note && *note)
        fprintf(out, "Task %d: RIP=0x%016lx, RSP=0x%016lx. %s\n",
                (int)tid, (unsigned long)regs.rip, (unsigned long)regs.rsp, note);
    else
        fprintf(out, "Task %d: RIP=0x%016lx, RSP=0x%016lx.\n",
                (int)tid, (unsigned long)regs.rip, (unsigned long)regs.rsp);
#elif (defined(__x86_64__) || defined(__i386__)) && __WORDSIZE == 32
    if (note && *note)
        fprintf(out, "Task %d: EIP=0x%08lx, ESP=0x%08lx. %s\n",
                (int)tid, (unsigned long)regs.eip, (unsigned long)regs.esp, note);
    else
        fprintf(out, "Task %d: EIP=0x%08lx, ESP=0x%08lx.\n",
                (int)tid, (unsigned long)regs.eip, (unsigned long)regs.esp);
#endif
}
int main(int argc, char *argv[]) { pid_t *tid = 0; size_t tids = 0; size_t tids_max = 0; size_t t, s; long r;
pid_t child; int status;
if (argc < 2 || !strcmp(argv[1],"-h") || !strcmp(argv[1],"--help")) { fprintf(stderr," "); fprintf(stderr,"Usage: %s [ -h | --help ] ", argv[0]); fprintf(stderr," %s COMMAND [ ARGS ... ] ", argv[0]); fprintf(stderr," "); fprintf(stderr,"This program executes COMMAND in a child process, "); fprintf(stderr,"and waits for it to stop (via a SIGSTOP signal). "); fprintf(stderr,"When that occurs, the register state of each thread "); fprintf(stderr,"is dumped to standard output, then the child process "); fprintf(stderr,"is sent a SIGCONT signal. "); fprintf(stderr," "); return 1; }
child = fork(); if (child == (pid_t)-1) { fprintf(stderr,"fork() failed: %s. ", strerror(errno)); return 1; }
if (!child) { prctl(PR_SET_DUMPABLE, (long)1); prctl(PR_SET_PTRACER, (long)getppid()); fflush(stdout); fflush(stderr); execvp(argv[1], argv + 1); fprintf(stderr,"%s: %s. ", argv[1], strerror(errno)); return 127; }
fprintf(stderr,"Tracer: Waiting for child (pid %d) events.
", (int)child); fflush(stderr);
while (1) {
/* Wait for a child event. */ if (wait_process(child, &status)) break;
/* Exited? */ if (WIFEXITED(status) || WIFSIGNALED(status)) { errno = 0; break; }
/* At this point, only stopped events are interesting. */ if (!WIFSTOPPED(status)) continue;
/* Obtain task IDs. */ tids = get_tids(&tid, &tids_max, child); if (!tids) break;
printf("Process %d has %d tasks,", (int)child, (int)tids); fflush(stdout);
/* Attach to all tasks. */ for (t = 0; t < tids; t++) { do { r = ptrace(PTRACE_ATTACH, tid[t], (void *)0, (void *)0); } while (r == -1L && (errno == EBUSY || errno == EFAULT || errno == ESRCH)); if (r == -1L) { const int saved_errno = errno; while (t-->0) do { r = ptrace(PTRACE_DETACH, tid[t], (void *)0, (void *)0); } while (r == -1L && (errno == EBUSY || errno == EFAULT || errno == ESRCH)); tids = 0; errno = saved_errno; break; } } if (!tids) { const int saved_errno = errno; if (continue_process(child, &status)) break; printf(" failed to attach (%s). ", strerror(saved_errno)); fflush(stdout); if (WIFCONTINUED(status)) continue; errno = 0; break; }
printf(" attached to all.
"); fflush(stdout);
/* Dump the registers of each task. */ for (t = 0; t < tids; t++) show_registers(stdout, tid[t],""); printf(" "); fflush(stdout);
for (s = 0; s < SINGLESTEPS; s++) { do { r = ptrace(PTRACE_SINGLESTEP, tid[tids-1], (void *)0, (void *)0); } while (r == -1L && errno == ESRCH); if (!r) { for (t = 0; t < tids - 1; t++) show_registers(stdout, tid[t],""); show_registers(stdout, tid[tids-1],"Advanced by one step."); printf(" "); fflush(stdout); } else { fprintf(stderr,"Single-step failed: %s. ", strerror(errno)); fflush(stderr); } }
/* Detach from all tasks. */ for (t = 0; t < tids; t++) do { r = ptrace(PTRACE_DETACH, tid[t], (void *)0, (void *)0); } while (r == -1 && (errno == EBUSY || errno == EFAULT || errno == ESRCH)); tids = 0; if (continue_process(child, &status)) break; if (WIFCONTINUED(status)) { printf("Detached. Waiting for new stop events.
"); fflush(stdout); continue; } errno = 0; break; } if (errno) fprintf(stderr,"Tracer: Child lost (%s) ", strerror(errno)); else if (WIFEXITED(status)) fprintf(stderr,"Tracer: Child exited (%d) ", WEXITSTATUS(status)); else if (WIFSIGNALED(status)) fprintf(stderr,"Tracer: Child died from signal %d ", WTERMSIG(status)); else fprintf(stderr,"Tracer: Child vanished "); fflush(stderr);
return status; } |
tracer.c执行指定的命令,并等待该命令接收到SIGSTOP信号。 (tracer.c本身不会发送该信号;您可以让被跟踪进程自行停止,也可以从外部发送信号。)
好。
命令停止后,tracer.c将ptrace附加到每个线程,并以固定步数(SINGLESTEPS编译时常数)单步执行其中一个线程,显示每个线程的相关寄存器状态。
好。
之后,它将与命令分离,并向其发送SIGCONT信号以使其继续正常运行。
好。
这是我用于测试的简单测试程序worker.c:
好。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
#include <pthread.h>
#include <signal.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

/* Default value of the THREADS compile-time constant;
 * a definition supplied earlier (e.g. -DTHREADS=N) takes precedence. */
#ifndef THREADS
#define THREADS 2
#endif
/* Zero initially; catch_done() stores the number of the caught signal here. */
volatile sig_atomic_t done = 0;

/* Signal handler: remember which signal arrived. */
void catch_done(int signum)
{
    done = signum;
}
int install_done(const int signum) { struct sigaction act;
sigemptyset(&act.sa_mask); act.sa_handler = catch_done; act.sa_flags = 0; if (sigaction(signum, &act, NULL)) return errno; else return 0; }
/* Thread body: atomically increment the unsigned long that `data` points
 * to until `done` becomes nonzero.
 *
 * Returns the counter value (read atomically) converted to a pointer.
 */
void *worker(void *data)
{
    volatile unsigned long *const ticks = data;
    unsigned long final_value;

    for (;;) {
        if (done)
            break;
        __sync_add_and_fetch(ticks, 1UL);
    }

    /* OR-ing with zero leaves the value unchanged; it serves as an atomic read. */
    final_value = __sync_or_and_fetch(ticks, 0UL);
    return (void *)final_value;
}
/* worker: THREADS extra threads plus the original thread all increment one
 * shared counter until SIGHUP, SIGTERM or SIGUSR1 is caught.
 *
 * Returns 0 on normal completion, 1 if the signal handlers or a thread
 * could not be set up.
 */
int main(void)
{
    unsigned long counter = 0UL;
    pthread_t thread[THREADS];
    pthread_attr_t attrs;
    size_t i;

    if (install_done(SIGHUP) ||
        install_done(SIGTERM) ||
        install_done(SIGUSR1)) {
        fprintf(stderr, "Worker: Cannot install signal handlers: %s.\n", strerror(errno));
        return 1;
    }

    pthread_attr_init(&attrs);
    pthread_attr_setstacksize(&attrs, 65536);
    for (i = 0; i < THREADS; i++) {
        /* pthread_create() reports failure through its return value,
         * not through errno. */
        const int err = pthread_create(&thread[i], &attrs, worker, &counter);
        if (err) {
            done = 1;
            fprintf(stderr, "Worker: Cannot create thread: %s.\n", strerror(err));
            return 1;
        }
    }
    pthread_attr_destroy(&attrs);

    /* Let the original thread also do the worker dance. */
    worker(&counter);

    for (i = 0; i < THREADS; i++)
        pthread_join(thread[i], NULL);

    return 0;
}
编译都使用例如
好。
1 2 |
gcc -W -Wall -O3 -fomit-frame-pointer worker.c -pthread -o worker gcc -W -Wall -O3 -fomit-frame-pointer tracer.c -o tracer |
然后在单独的终端或后台运行,例如 ./tracer ./worker &
好。
跟踪器显示工作者的PID:
好。
1 |
Tracer: Waiting for child (pid 24275) events. |
此时,子进程正在正常运行。当您向子进程发送SIGSTOP时,动作开始。跟踪器检测到它,进行所需的跟踪,然后分离并让子进程正常继续:
好。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
kill -STOP 24275
Process 24275 has 3 tasks, attached to all.
Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428. Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8. Task 24277: RIP=0x0000000000400a5d, RSP=0x00007f399cfa6ee8.
Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428. Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8. Task 24277: RIP=0x0000000000400a5d, RSP=0x00007f399cfa6ee8. Advanced by one step.
Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428. Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8. Task 24277: RIP=0x0000000000400a63, RSP=0x00007f399cfa6ee8. Advanced by one step.
Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428. Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8. Task 24277: RIP=0x0000000000400a65, RSP=0x00007f399cfa6ee8. Advanced by one step.
Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428. Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8. Task 24277: RIP=0x0000000000400a58, RSP=0x00007f399cfa6ee8. Advanced by one step.
Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428. Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8. Task 24277: RIP=0x0000000000400a5d, RSP=0x00007f399cfa6ee8. Advanced by one step.
Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428. Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8. Task 24277: RIP=0x0000000000400a63, RSP=0x00007f399cfa6ee8. Advanced by one step.
Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428. Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8. Task 24277: RIP=0x0000000000400a65, RSP=0x00007f399cfa6ee8. Advanced by one step.
Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428. Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8. Task 24277: RIP=0x0000000000400a58, RSP=0x00007f399cfa6ee8. Advanced by one step.
Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428. Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8. Task 24277: RIP=0x0000000000400a5d, RSP=0x00007f399cfa6ee8. Advanced by one step.
Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428. Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8. Task 24277: RIP=0x0000000000400a63, RSP=0x00007f399cfa6ee8. Advanced by one step.
Detached. Waiting for new stop events. |
您可以根据需要多次重复上述操作。请注意,我选择了SIGSTOP信号作为触发器,因为tracer.c这样也可以用作根据每个请求生成复杂的多线程核心转储的基础(因为多线程进程可以通过向自身发送SIGSTOP来简单地触发它) 。
好。
上面示例中所有线程都在其中自旋的worker()函数,其反汇编如下:
好。
1 2 3 4 5 6 7 8 9 10 11 12 |
0x400a50: eb 0b jmp 0x400a5d 0x400a52: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) 0x400a58: f0 48 83 07 01 lock addq $0x1,(%rdi) = fourth step 0x400a5d: 8b 05 00 00 00 00 mov 0x0(%rip),%eax = first step 0x400a63: 85 c0 test %eax,%eax = second step 0x400a65: 74 f1 je 0x400a58 = third step 0x400a67: 48 8b 07 mov (%rdi),%rax 0x400a6a: 48 89 c2 mov %rax,%rdx 0x400a6d: f0 48 0f b1 07 lock cmpxchg %rax,(%rdi) 0x400a72: 75 f6 jne 0x400a6a 0x400a74: 48 89 d0 mov %rdx,%rax 0x400a77: c3 retq |
现在,该测试程序仅显示了如何停止进程,将其附加到其所有线程,单步执行所需数量的指令,然后让所有线程正常继续;它还不能证明同样适用于让特定线程正常继续运行(通过PTRACE_CONT)。但是,我在下面描述的细节向我表明,对于PTRACE_CONT,相同的方法应该可以正常工作。
好。
我在编写上述测试程序时遇到的主要问题或惊奇是:
好。
1 2 3 4 5 |
/* Pattern: repeat a ptrace request for as long as it fails with
 * EBUSY, EFAULT or ESRCH. PTRACE_cmd and "..." are placeholders. */
long r;
do { r = ptrace(PTRACE_cmd, tid, ...); } while (r == -1L && (errno == EBUSY || errno == EFAULT || errno == ESRCH)); |
循环,特别是对于ESRCH情况(由于ptrace手册页描述而仅添加了其他情况)。
好。
您会看到,大多数ptrace命令仅在任务停止时才被允许。但是,当任务仍在完成某个操作(例如单步命令)时,它并未处于停止状态。因此,使用上述循环(可以加上一毫秒的nanosleep或类似操作以避免浪费CPU)可以确保在尝试发出新的ptrace命令之前,上一条命令已经完成(因而任务已停止)。
好。
Kerrek SB,我相信至少您在测试程序中遇到的一些麻烦是由于此问题引起的吗?对我个人而言,这是一种D'oh!意识到这一点当然是必要的,因为追踪本质上是异步的,而不是同步的。
好。
(这种异步性也是我上面提到的SIGCONT-PTRACE_CONT交互的原因。我确实相信,使用上述循环正确处理后,交互不再是问题,并且实际上是可以理解的。)
好。
在此答案的注释中添加:
好。
Linux内核在task_struct结构中使用一组任务状态标志(有关定义,请参见include/linux/sched.h)来跟踪每个任务的状态。 ptrace()的面向用户空间的一面在kernel/ptrace.c中定义。
好。
调用PTRACE_SINGLESTEP或PTRACE_CONT时,kernel/ptrace.c:ptrace_continue()处理大多数细节。通过调用wake_up_state(child, __TASK_TRACED)(kernel/sched/core.c::try_to_wake_up(child, __TASK_TRACED, 0))完成。
好。
通过SIGSTOP信号停止进程时,所有任务将停止,并最终处于"已停止,未跟踪"状态。
好。
附加到每个任务(通过PTRACE_ATTACH或PTRACE_SEIZE,请参见kernel/ptrace.c:ptrace_attach())可以修改任务状态。但是,ptrace状态位(请参见include/linux/ptrace.h:PT_常量)与任务可运行状态位(请参见include/linux/sched.h:TASK_常量)是分开的。
好。
附加到任务并向进程发送SIGCONT信号后,停止状态不会立即被修改(我相信),因为也正在跟踪任务。执行PTRACE_SINGLESTEP或PTRACE_CONT的结果以kernel/sched/core.c::try_to_wake_up(child, __TASK_TRACED, 0)结尾,这将更新任务状态,并将任务移至运行队列。
好。
现在,我尚未找到其代码路径的复杂部分是:下次调度该任务时,内核中的任务状态是如何更新的。我的测试表明,使用单步执行(这是另一个任务状态标志)时,只有任务状态被更新,并且单步标志被清除。而PTRACE_CONT似乎并不那么可靠;我相信这是因为单步标志"强制"了任务状态的改变。也许在继续信号的传递与状态改变之间存在"竞态条件"?
好。
(进一步编辑:内核开发人员肯定希望调用wait(),例如,请参见此线程。)
好。
换句话说,在注意到该进程已停止之后(请注意,如果该进程不是子进程并且尚未附加,则可以使用/proc/PID/stat或/proc/PID/status),我相信以下过程是最可靠的过程:
好。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
/* Procedure sketch: with the process already stopped, attach to every task,
 * send SIGCONT and wait for the "continued" report, then single-step each
 * task once and read its registers. The empty "Fatal error" blocks are
 * placeholders for the caller's own error handling. */
pid_t   pid, p; /* Process owning the tasks */
pid_t  *tid;    /* Task ID array */
size_t  tids;   /* Tasks */
long    result;
int     status;
size_t  i;

for (i = 0; i < tids; i++) {
    while (1) {
        result = ptrace(PTRACE_ATTACH, tid[i], (void *)0, (void *)0);
        if (result == -1L && (errno == ESRCH || errno == EBUSY || errno == EFAULT || errno == EIO)) {
            /* To avoid burning up CPU for nothing: */
            sched_yield(); /* or nanosleep(), or usleep() */
            continue;
        }
        break;
    }
    if (result == -1L) {
        /*
         * Fatal error. First detach from tid[0..i-1], then exit.
        */
    }
}

/* Send SIGCONT to the process. */
if (kill(pid, SIGCONT)) {
    /*
     * Fatal error, see errno. Exit.
    */
}

/* Since we are attached to the process,
 * we can wait() on it. */
while (1) {
    errno = 0;
    status = 0;
    p = waitpid(pid, &status, WCONTINUED);
    if (p == (pid_t)-1) {
        if (errno == EINTR)
            continue;
        else
            break;
    } else
    if (p != pid) {
        errno = ESRCH;
        break;
    } else
    if (WIFCONTINUED(status)) {
        errno = 0;
        break;
    }
}
if (errno) {
    /*
     * Fatal error. First detach from tid[0..tids-1], then exit.
    */
}

/* Single-step each task to update the task states. */
for (i = 0; i < tids; i++) {
    while (1) {
        result = ptrace(PTRACE_SINGLESTEP, tid[i], (void *)0, (void *)0);
        if (result == -1L && errno == ESRCH) {
            /* To avoid burning up CPU for nothing: */
            sched_yield(); /* or nanosleep(), or usleep() */
            continue;
        }
        break;
    }
    if (result == -1L) {
        /*
         * Fatal error. All tasks are attached at this point:
         * first detach from tid[0..tids-1], then exit.
        */
    }
}

/* Obtain task register structures, to make sure the single-steps
 * have completed and their states have stabilized. */
for (i = 0; i < tids; i++) {
    struct user_regs_struct regs;

    while (1) {
        result = ptrace(PTRACE_GETREGS, tid[i], &regs, &regs);
        if (result == -1L && (errno == ESRCH || errno == EBUSY || errno == EFAULT || errno == EIO)) {
            /* To avoid burning up CPU for nothing: */
            sched_yield(); /* or nanosleep(), or usleep() */
            continue;
        }
        break;
    }
    if (result == -1L) {
        /*
         * Fatal error. All tasks are attached at this point:
         * first detach from tid[0..tids-1], then exit.
        */
    }
}
完成上述操作后,所有任务都应附加并处于预期状态,以便例如PTRACE_CONT无需任何技巧就可以工作。
好。
如果行为在将来的内核中发生变化-我确实相信STOP / CONT信号与跟踪之间的相互作用可能会发生变化; 至少应该向LKML开发人员提出有关此行为的问题! -,以上步骤仍然可以正常进行。 (谨慎起见,通过多次使用PTRACE_SINGLESTEP循环可能也是个好主意。)
好。
与PTRACE_CONT的区别在于:如果将来行为发生变化,最初的PTRACE_CONT可能会真的让进程继续运行,从而导致其后的ptrace()调用失败。 而使用PTRACE_SINGLESTEP时,进程会停止,从而允许后续的ptrace()调用成功。
好。
有什么问题吗
好。
好。
相关讨论
Can I attach to a specific thread?
是的,至少在当前内核上是如此。
Does that mean that single-stepping only steps through that one thread's instructions? Does it stop all the process's threads?
是。它不会停止其他线程,只会停止附加的线程。
Is there a way to step forward only in one single thread but guarantee that the other threads remain stopped?
是。将SIGSTOP发送到进程(使用waitpid(PID,,WUNTRACED)等待进程停止),然后对进程中的每个线程执行PTRACE_ATTACH。发送SIGCONT(使用waitpid(PID,,WCONTINUED)等待进程继续)。
由于在连接时所有线程均已停止,并且连接停止了线程,因此在传递SIGCONT信号后,所有线程均保持停止状态。您可以按照自己喜欢的任何顺序单步执行线程。
我发现这个问题很有趣,足以让我写一个测试用例。 (好吧,其实我怀疑无论如何都没人会只凭我一句话就相信,所以我决定最好给出您可以自行重现的证明。)
我的系统似乎遵循Linux man-pages项目中man 2 ptrace的描述,而Kerrisk似乎非常擅长让手册页与内核行为保持同步。总的来说,关于Linux内核,我更偏向kernel.org上的资料,而不是其他来源。
摘要:
-
附加到进程本身(TID == PID)只会停止原始线程,而不会停止所有线程。
-
附加到特定线程(使用/proc/PID/task/中的TID)确实会停止该线程。 (换句话说,TID == PID的线程并不特殊。)
-
发送SIGSTOP到进程将停止所有线程,但是ptrace()仍然可以正常工作。
-
如果您向该进程发送了SIGSTOP,请在分离前不要调用ptrace(PTRACE_CONT, TID)。 PTRACE_CONT似乎会干扰SIGCONT信号。
您可以先发送SIGSTOP,然后发送PTRACE_ATTACH,然后发送SIGCONT,没有任何问题。线程将保持停止状态(由于ptrace)。换句话说,PTRACE_ATTACH和PTRACE_DETACH与SIGSTOP和SIGCONT混合得很好,没有任何副作用。
-
即使您尝试使用tgkill()(或pthread_kill())将信号发送到特定线程,SIGSTOP和SIGCONT也会影响整个进程。
-
要停止并继续某个特定线程,请对其使用PTRACE_ATTACH;要停止和继续某个进程的所有线程,分别向该进程发送SIGSTOP和SIGCONT信号。
就我个人而言,我相信这可以验证我在另一个问题中建议的方法。
这是您可以编译运行的难看的测试代码,traces.c:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 |
/* NOTE(review): GNU_SOURCE (no leading underscore) is not a feature-test
 * macro, so this line enables nothing. It is left as is on purpose:
 * defining _GNU_SOURCE instead would likely make newer glibc versions
 * declare their own tgkill() in <signal.h>, clashing with the static
 * tgkill() wrapper defined in this file - confirm before changing. */
#define GNU_SOURCE
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/ptrace.h>
#include <sys/syscall.h>
#include <dirent.h>
#include <pthread.h>
#include <signal.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

/* Default value of the THREADS compile-time constant;
 * a definition supplied earlier (e.g. -DTHREADS=N) takes precedence. */
#ifndef THREADS
#define THREADS 3
#endif
/* Send signal `sig` to thread `tid` of thread group `tgid` using the raw
 * tgkill system call.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
static int tgkill(int tgid, int tid, int sig)
{
    /* syscall() already returns -1 and stores the error code in errno when
     * the call fails, so errno must not be derived from the return value. */
    if (syscall(SYS_tgkill, tgid, tid, sig) < 0)
        return -1;

    return 0;
}
volatile unsigned long counter[THREADS + 1] = { 0UL };
/* `run` starts at zero; the worker threads spin until it becomes nonzero. */
volatile sig_atomic_t run = 0;

/* `done` starts at zero; handle_done() stores the caught signal number here. */
volatile sig_atomic_t done = 0;

/* Signal handler: remember which signal arrived. */
void handle_done(int signum)
{
    done = signum;
}
int install_done(int signum) { struct sigaction act; sigemptyset(&act.sa_mask); act.sa_handler = handle_done; act.sa_flags = 0; if (sigaction(signum, &act, NULL)) return errno; return 0; }
/* Thread body: spin until `run` is nonzero, then keep incrementing the
 * unsigned long that `data` points to until `done` is nonzero.
 *
 * Returns the last counter value converted to a pointer.
 */
void *worker(void *data)
{
    volatile unsigned long *const slot = data;

    /* Busy-wait for the start flag. */
    for (;;) {
        if (run)
            break;
    }

    for (;;) {
        if (done)
            break;
        (*slot)++;
    }

    return (void *)(*slot);
}
/* Build a newly allocated list of the task IDs found under /proc/PID/task/.
 *
 * The list is terminated with (pid_t)0; the number of TIDs (terminator not
 * counted) is stored in *countptr when countptr is non-NULL. Directory
 * entries that are not a decimal number of at least 2 are skipped.
 * The caller releases the list with free().
 *
 * Returns the list with errno == 0, or NULL with errno set:
 *   EINVAL        pid < 2
 *   ENAMETOOLONG  the directory name did not fit the local buffer
 *   ENOMEM        out of memory
 *   EIO           readdir() or closedir() failed
 *   ENOENT        no TIDs were found
 *   (other)       whatever opendir() reported
 */
pid_t *gettids(const pid_t pid, size_t *const countptr)
{
    char path[128];
    DIR *taskdir;
    struct dirent *entry;
    pid_t *tids = NULL;
    pid_t *resized;
    size_t capacity = 0;
    size_t count = 0;
    int failure;

    if ((int)pid < 2) {
        errno = EINVAL;
        return NULL;
    }

    if (snprintf(path, sizeof path, "/proc/%d/task/", (int)pid) >= (int)sizeof path) {
        errno = ENAMETOOLONG;
        return NULL;
    }

    taskdir = opendir(path);
    if (taskdir == NULL)
        return NULL;

    /* errno is cleared before every readdir() so that a NULL result can be
     * told apart: errno still 0 means end of directory, nonzero an error. */
    for (errno = 0; (entry = readdir(taskdir)) != NULL; errno = 0) {
        int value;
        char extra;

        if (sscanf(entry->d_name, "%d%c", &value, &extra) != 1 || value < 2)
            continue;

        if (count >= capacity) {
            capacity = (count | 127) + 129;
            resized = realloc(tids, capacity * sizeof tids[0]);
            if (resized == NULL) {
                failure = ENOMEM;
                goto fail_close;
            }
            tids = resized;
        }

        tids[count++] = (pid_t)value;
    }
    if (errno != 0) {
        failure = EIO;
        goto fail_close;
    }

    if (closedir(taskdir) != 0) {
        failure = EIO;
        goto fail;
    }

    if (count < 1) {
        failure = ENOENT;
        goto fail;
    }

    /* Resize to exactly the TIDs plus the terminating zero. */
    capacity = count + 1;
    resized = realloc(tids, capacity * sizeof tids[0]);
    if (resized == NULL) {
        failure = ENOMEM;
        goto fail;
    }
    tids = resized;
    tids[count] = (pid_t)0;

    if (countptr != NULL)
        *countptr = count;

    errno = 0;
    return tids;

fail_close:
    closedir(taskdir);
fail:
    free(tids);
    errno = failure;
    return NULL;
}
/* Child process body: start THREADS worker threads (one counter each),
 * release them by setting `run`, notify the parent with SIGUSR1, and
 * increment the last counter in this thread until `done` is set.
 * Afterwards join the workers and print all counters.
 *
 * Returns 0 on success, 1 if the signal handler or a thread could not be
 * set up.
 */
int child_main(void)
{
    pthread_t id[THREADS];
    int i;

    if (install_done(SIGUSR1)) {
        fprintf(stderr, "Cannot set SIGUSR1 signal handler.\n");
        return 1;
    }

    for (i = 0; i < THREADS; i++) {
        /* pthread_create() reports failure through its return value,
         * not through errno. */
        const int err = pthread_create(&id[i], NULL, worker, (void *)&counter[i]);
        if (err) {
            fprintf(stderr, "Cannot create thread %d of %d: %s.\n", i + 1, THREADS, strerror(err));
            return 1;
        }
    }

    run = 1;

    kill(getppid(), SIGUSR1);

    while (!done)
        counter[THREADS]++;

    for (i = 0; i < THREADS; i++)
        pthread_join(id[i], NULL);

    printf("Final counters:\n");
    for (i = 0; i < THREADS; i++)
        printf("\tThread %d: %lu\n", i + 1, counter[i]);
    printf("\tMain thread: %lu\n", counter[THREADS]);

    return 0;
}
int main(void) { pid_t *tid = NULL; size_t tids = 0; int i, k; pid_t child, p;
if (install_done(SIGUSR1)) { fprintf(stderr,"Cannot set SIGUSR1 signal handler. "); return 1; }
child = fork(); if (!child) return child_main();
if (child == (pid_t)-1) { fprintf(stderr,"Cannot fork. "); return 1; }
while (!done) usleep(1000);
tid = gettids(child, &tids); if (!tid) { fprintf(stderr,"gettids(): %s. ", strerror(errno)); kill(child, SIGUSR1); return 1; }
fprintf(stderr,"Child process %d has %d tasks. ", (int)child, (int)tids); fflush(stderr);
for (k = 0; k < (int)tids; k++) { const pid_t t = tid[k];
if (ptrace(PTRACE_ATTACH, t, (void *)0L, (void *)0L)) { fprintf(stderr,"Cannot attach to TID %d: %s. ", (int)t, strerror(errno)); kill(child, SIGUSR1); return 1; }
fprintf(stderr,"Attached to TID %d.
", (int)t);
fprintf(stderr,"Peeking the counters in the child process: "); for (i = 0; i <= THREADS; i++) { long v; do { errno = 0; v = ptrace(PTRACE_PEEKDATA, t, &counter[i], NULL); } while (v == -1L && (errno == EIO || errno == EFAULT || errno == ESRCH)); fprintf(stderr,"\tcounter[%d] = %lu ", i, (unsigned long)v); } fprintf(stderr,"Waiting a short moment ..."); fflush(stderr);
usleep(250000);
fprintf(stderr,"and another peek: "); for (i = 0; i <= THREADS; i++) { long v; do { errno = 0; v = ptrace(PTRACE_PEEKDATA, t, &counter[i], NULL); } while (v == -1L && (errno == EIO || errno == EFAULT || errno == ESRCH)); fprintf(stderr,"\tcounter[%d] = %lu ", i, (unsigned long)v); } fprintf(stderr," "); fflush(stderr);
usleep(250000);
ptrace(PTRACE_DETACH, t, (void *)0L, (void *)0L); }
for (k = 0; k < 4; k++) { const pid_t t = tid[tids / 2];
if (k == 0) { fprintf(stderr,"Sending SIGSTOP to child process ..."); fflush(stderr); kill(child, SIGSTOP); } else if (k == 1) { fprintf(stderr,"Sending SIGCONT to child process ..."); fflush(stderr); kill(child, SIGCONT); } else if (k == 2) { fprintf(stderr,"Sending SIGSTOP to TID %d ...", (int)tid[0]); fflush(stderr); tgkill(child, tid[0], SIGSTOP); } else if (k == 3) { fprintf(stderr,"Sending SIGCONT to TID %d ...", (int)tid[0]); fflush(stderr); tgkill(child, tid[0], SIGCONT); } usleep(250000); fprintf(stderr,"done. "); fflush(stderr);
if (ptrace(PTRACE_ATTACH, t, (void *)0L, (void *)0L)) { fprintf(stderr,"Cannot attach to TID %d: %s. ", (int)t, strerror(errno)); kill(child, SIGUSR1); return 1; }
fprintf(stderr,"Attached to TID %d.
", (int)t);
fprintf(stderr,"Peeking the counters in the child process: "); for (i = 0; i <= THREADS; i++) { long v; do { errno = 0; v = ptrace(PTRACE_PEEKDATA, t, &counter[i], NULL); } while (v == -1L && (errno == EIO || errno == EFAULT || errno == ESRCH)); fprintf(stderr,"\tcounter[%d] = %lu ", i, (unsigned long)v); } fprintf(stderr,"Waiting a short moment ..."); fflush(stderr);
usleep(250000);
fprintf(stderr,"and another peek: "); for (i = 0; i <= THREADS; i++) { long v; do { errno = 0; v = ptrace(PTRACE_PEEKDATA, t, &counter[i], NULL); } while (v == -1L && (errno == EIO || errno == EFAULT || errno == ESRCH)); fprintf(stderr,"\tcounter[%d] = %lu ", i, (unsigned long)v); } fprintf(stderr," "); fflush(stderr);
usleep(250000);
ptrace(PTRACE_DETACH, t, (void *)0L, (void *)0L); }
kill(child, SIGUSR1);
do { p = waitpid(child, NULL, 0); if (p == -1 && errno != EINTR) break; } while (p != child);
return 0; } |
使用例如编译并运行
1 2 |
gcc -DTHREADS=3 -W -Wall -O3 traces.c -pthread -o traces ./traces |
输出是子进程计数器的转储(每个计数器在单独的线程中递增,包括使用最终计数器的原始线程)。在短暂的等待时间内比较计数器。例如:
1 2 3 4 5 6 7 8 9 10 11 12 13 |
Child process 18514 has 4 tasks. Attached to TID 18514.
Peeking the counters in the child process: counter[0] = 0 counter[1] = 0 counter[2] = 0 counter[3] = 0 Waiting a short moment ... and another peek: counter[0] = 18771865 counter[1] = 6435067 counter[2] = 54247679 counter[3] = 0 |
如上所示,只有使用最终计数器的初始线程(其TID == PID)停止了。其他三个线程也是如此,它们按顺序使用前三个计数器:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
Attached to TID 18515.
Peeking the counters in the child process: counter[0] = 25385151 counter[1] = 13459822 counter[2] = 103763861 counter[3] = 560872 Waiting a short moment ... and another peek: counter[0] = 25385151 counter[1] = 69116275 counter[2] = 120500164 counter[3] = 9027691
Attached to TID 18516.
Peeking the counters in the child process: counter[0] = 25397582 counter[1] = 105905400 counter[2] = 155895025 counter[3] = 17306682 Waiting a short moment ... and another peek: counter[0] = 32358651 counter[1] = 105905400 counter[2] = 199601078 counter[3] = 25023231
Attached to TID 18517.
Peeking the counters in the child process: counter[0] = 40600813 counter[1] = 111675002 counter[2] = 235428637 counter[3] = 32298929 Waiting a short moment ... and another peek: counter[0] = 48727731 counter[1] = 143870702 counter[2] = 235428637 counter[3] = 39966259 |
接下来的两种情况检查针对整个进程的SIGCONT / SIGSTOP:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
Sending SIGSTOP to child process ... done. Attached to TID 18516.
Peeking the counters in the child process: counter[0] = 56887263 counter[1] = 170646440 counter[2] = 235452621 counter[3] = 48077803 Waiting a short moment ... and another peek: counter[0] = 56887263 counter[1] = 170646440 counter[2] = 235452621 counter[3] = 48077803
Sending SIGCONT to child process ... done. Attached to TID 18516.
Peeking the counters in the child process: counter[0] = 64536344 counter[1] = 182359343 counter[2] = 253660731 counter[3] = 56422231 Waiting a short moment ... and another peek: counter[0] = 72029244 counter[1] = 182359343 counter[2] = 288014365 counter[3] = 63797618 |
如您所见,发送SIGSTOP将停止所有线程,但不会妨碍ptrace()。同样,在SIGCONT之后,线程继续正常运行。
最后两个案例检查了使用tgkill()将SIGSTOP / SIGCONT发送到特定线程(与第一个计数器对应的线程)并附加到另一个线程时的效果:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
Sending SIGSTOP to TID 18514 ... done. Attached to TID 18516.
Peeking the counters in the child process: counter[0] = 77012930 counter[1] = 183059526 counter[2] = 344043770 counter[3] = 71120227 Waiting a short moment ... and another peek: counter[0] = 77012930 counter[1] = 183059526 counter[2] = 344043770 counter[3] = 71120227
Sending SIGCONT to TID 18514 ... done. Attached to TID 18516.
Peeking the counters in the child process: counter[0] = 88082419 counter[1] = 194059048 counter[2] = 359342314 counter[3] = 84887463 Waiting a short moment ... and another peek: counter[0] = 100420161 counter[1] = 194059048 counter[2] = 392540525 counter[3] = 111770366 |
不幸的是,但也正如预期的那样,处置(停止/运行)是进程范围的,而不是特定于线程的,如您在上面看到的。这意味着要停止特定线程并让其他线程正常运行,您需要分别对要停止的线程执行PTRACE_ATTACH。
为了证明我上面的所有陈述,您可能必须添加测试用例;我最终得到了相当多的代码副本,所有代码都进行了稍微的编辑,以测试全部内容,而且我不确定我是否选择了最完整的代码集。如果您发现有遗漏,我很乐意扩展测试程序。
有什么问题吗
相关讨论
进程中的每个线程都被单独跟踪(并且每个线程都可能通过不同的跟踪过程进行跟踪,或者被跟踪)。当您调用ptrace attach时,您总是只附加到一个线程。只有该线程将停止-其他线程将继续按原样运行。
ptrace()手册页的最新版本非常清楚:
Attachment and subsequent commands are per thread: in a multithreaded
process, every thread can be individually attached to a (potentially
different) tracer, or left not attached and thus not debugged.
Therefore,"tracee" always means"(one) thread", never"a (possibly
multithreaded) process". Ptrace commands are always sent to a
specific tracee using a call of the form
1 |
ptrace(PTRACE_foo, pid, ...) |
where pid is the thread ID of the corresponding Linux thread.
(Note that in this page, a"multithreaded process" means a thread
group consisting of threads created using the clone(2)
CLONE_THREAD flag.)
单步执行仅影响您将其定向到的线程。如果其他线程正在运行,则它们将继续运行,如果它们在跟踪停止中,则它们将保持在跟踪停止状态。 (这意味着,如果您正在单步执行的线程尝试获取互斥量或另一个未运行线程所拥有的类似同步资源,则它将无法获取该互斥量)。
如果要在单步执行一个线程时停止该进程的所有线程,则将需要附加到所有线程。更为复杂的是,如果在尝试附加到进程时进程正在运行,则在枚举线程时可能会创建新线程。
相关讨论
Does it stop all the process's threads?
是
它跟踪该进程,该进程的所有线程都停止。
试想如果不是这样,您又如何能在IDE中看到不同的线程呢?
从手册:
The ptrace() system call provides a means by which one process (the"tracer") may observe and control the execution of another process (the"tracee")
附带的示例代码:
1 2 3 4 5 6 |
/* Attach to Tpid and report the result of a failed attempt.
 * The return value is stored in res so that the message prints the
 * actual result of the call. */
printf("Attaching to process %d\n", Tpid);
if ((res = ptrace(PTRACE_ATTACH, Tpid, 0, 0)) != 0) {
    printf("Attach result %d\n", res);
}
因此,是的,您可以使用一个线程,是的,它将停止该进程的所有线程。
1 2 3 4 5 |
/* Single-step Tpid (passing signo as the data argument); print an error and
 * exit if the request fails, otherwise wait() for the next status report. */
if ((res = ptrace(PTRACE_SINGLESTEP, Tpid, 0, signo)) < 0) { perror("Ptrace singlestep error"); exit(1); } res = wait(&stat); |
原链接:关于C#:如何使用PTRACE获得多个线程的一致视图? | 码农家园