这两天又遇到了一个奇怪的问题:简单的说,就是在 vmware 通过 rabbitmq 的 init 脚本无法正常 stop 掉相应的进程(实际上调用的是 rabbitmqctl stop <pidfile> )。
问题表现
确认 RabbitMQ 的运行状态和对应的 pid 文件内容[root@gfsd1 ~]#
[root@gfsd1 ~]# ps aux|grep rabbit
root 11813 0.0 0.0 106368 1004 pts/2 S 19:21 0:00 /bin/sh /etc/init.d/rabbitmq start
root 11815 0.0 0.0 106100 1200 pts/2 S 19:21 0:00 /bin/bash -c ulimit -S -c 0 >/dev/null 2>&1 ; /usr/local/rabbitmq/sbin/rabbitmq-server
root 11817 46.1 0.3 2806172 53252 pts/2 Sl 19:21 0:07 /usr/local/rabbitmq/erts-6.0/bin/beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/rabbitmq -progname erl -- -home /root -- -boot /usr/local/rabbitmq/releases/3.3.5/start_clean -pa /usr/local/rabbitmq/sbin/../ebin -noshell -noinput -s rabbit boot -sname rabbitmq@rmq_1 -boot start_sasl -config /usr/local/rabbitmq/sbin/../etc/rabbitmq/rabbitmq -kernel inet_default_connect_options [{nodelay,true}] -sasl errlog_type error -sasl sasl_error_logger false -rabbit error_logger {file,"/usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq@rmq_1.log"} -rabbit sasl_error_logger {file,"/usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq@rmq_1-sasl.log"} -rabbit enabled_plugins_file "/usr/local/rabbitmq/sbin/../etc/rabbitmq/enabled_plugins" -rabbit plugins_dir "/usr/local/rabbitmq/sbin/../plugins" -rabbit plugins_expand_dir "/usr/local/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1-plugins-expand" -os_mon start_cpu_sup false -os_mon start_disksup false -os_mon start_memsup false -mnesia dir "/usr/local/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1"
root 12534 0.0 0.0 103252 836 pts/2 S+ 19:21 0:00 grep rabbit
[root@gfsd1 ~]#
[root@gfsd1 ~]#
[root@gfsd1 ~]# cat /var/run/rabbitmq/pid
11817
[root@gfsd1 ~]#
通过 init 脚本 stop 服务 [root@gfsd1 ~]# /etc/init.d/rabbitmq stop
Stopping rabbitmq-server: /var/run/rabbitmq/pid
Stopping and halting node rabbitmq@rmq_1 ...
(卡住)
查看日志,发现 RabbitMQ 服务已经执行了停止动作。 [root@gfsd1 ~]# tail -f /usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq\@rmq_1.log
...
=INFO REPORT==== 14-Mar-2016::19:21:38 ===
Stopping RabbitMQ
=INFO REPORT==== 14-Mar-2016::19:21:38 ===
stopped TCP Listener on [::]:6672
=ERROR REPORT==== 14-Mar-2016::19:21:38 ===
AMQP connection <0.424.0> (running), channel 0 - error:
{amqp_error,connection_forced,
"broker forced connection closure with reason 'shutdown'",none}
=INFO REPORT==== 14-Mar-2016::19:21:38 ===
Halting Erlang VM
对应代码在 rabbit.erl 中,如下所示 可以看到,最终调用了 init:stop() 进行了停止。
然而,此时虽然 RabbitMQ 服务进程已经退出,但调用 rabbitmqctl stop <pidfile> 进程却退不出来!
[root@gfsd1 ~]#
[root@gfsd1 ~]# ps ajxf|grep rabbit
27541 13082 13082 27541 pts/2 13082 S+ 0 0:00 | | \_ /bin/sh /etc/init.d/rabbitmq stop
13082 13127 13082 27541 pts/2 13082 Sl+ 0 0:00 | | \_ /usr/local/rabbitmq/erts-6.0/bin/beam.smp -- -root /usr/local/rabbitmq -progname erl -- -home /root -- -boot /usr/local/rabbitmq/releases/3.3.5/start_clean -pa /usr/local/rabbitmq/sbin/../ebin -noshell -noinput -hidden -sname rabbitmqctl13127 -boot start_clean -s rabbit_control_main -nodename rabbitmq@rmq_1 -extra stop /var/run/rabbitmq/pid
12431 13687 13686 12431 pts/5 13686 S+ 0 0:00 | \_ grep rabbit
[root@gfsd1 ~]#
通过 Ctrl+C 停止调用后,可以看到 rabbitmqctl stop <pidfile> 调用进程才停止。 [root@gfsd1 ~]# /etc/init.d/rabbitmq stop
Stopping rabbitmq-server: /var/run/rabbitmq/pid
Stopping and halting node rabbitmq@rmq_1 ...
(卡住)
^C
BREAK: (a)bort (c)ontinue (p)roc info (i)nfo (l)oaded
(v)ersion (k)ill (D)b-tables (d)istribution
remove pid file and dir
rabbitmq-server.
[root@gfsd1 ~]# ^C
[root@gfsd1 ~]#
[root@gfsd1 ~]#
[root@gfsd1 ~]# ps ajxf|grep rabbit
27541 1658 1657 27541 pts/2 1657 S+ 0 0:00 | | \_ grep rabbit
[root@gfsd1 ~]#
[root@gfsd1 ~]#
[root@gfsd1 ~]# ll /var/run/rabbitmq/pid
ls: cannot access /var/run/rabbitmq/pid: No such file or directory
[root@gfsd1 ~]#
问题分析
既然 RabbitMQ 服务能够正常停止,那么问题就只有一个:为什么 rabbitmqctl stop <pidfile> 会退不出来?让我们 查看下相关代码在 rabbitmqctl 脚本中
...
exec ${ERL_DIR}erl \
-pa "${RABBITMQ_HOME}/ebin" \
-noinput \
-hidden \
${RABBITMQ_CTL_ERL_ARGS} \
-sname rabbitmqctl$$ \
-boot "${CLEAN_BOOT_FILE}" \
-s rabbit_control_main \ # 入口函数
-nodename $RABBITMQ_NODENAME \
-extra "$@"
在 rabbit_control_main.erl 中 ...
-define(EXTERNAL_CHECK_INTERVAL, 1000).
...
start() ->
...
{Command, Opts, Args} =
case parse_arguments(init:get_plain_arguments(), NodeStr) of
{ok, Res} -> Res;
no_command -> print_error("could not recognise command", []),
usage()
end,
...
%% The reason we don't use a try/catch here is that rpc:call turns
%% thrown errors into normal return values
case catch action(Command, Node, Args, Opts, Inform) of
ok ->
case Quiet of
true -> ok;
false -> io:format("...done.~n")
end,
rabbit_misc:quit(0);
{ok, Info} ->
case Quiet of
true -> ok;
false -> io:format("...done (~p).~n", [Info])
end,
rabbit_misc:quit(0);
...
end.
对应到实际情况,我们的 Command 为 stop ,所以会调用 ...
action(stop, Node, Args, _Opts, Inform) ->
Inform("Stopping and halting node ~p", [Node]),
Res = call(Node, {rabbit, stop_and_halt, []}),
case {Res, Args} of
{ok, [PidFile]} -> wait_for_process_death( %% 等待指定 pid 的进程退出
read_pid_file(PidFile, false)); %% 读取 pid 文件的内容
{ok, [_, _| _]} -> exit({badarg, Args});
_ -> ok
end,
Res;
...
wait_for_process_death(Pid) ->
case process_up(Pid) of
true -> timer:sleep(?EXTERNAL_CHECK_INTERVAL), %% 只要认为指定 pid 的进程未退出,则每隔 1s 检查一次
wait_for_process_death(Pid);
false -> ok
end.
...
% Test using some OS clunkiness since we shouldn't trust
% rpc:call(os, getpid, []) at this point
process_up(Pid) ->
with_os([{unix, fun () ->
run_ps(Pid) =:= 0 %% Linux 中会执行这里。从上面的判定知道,只有不等于 0 时,才认为目标进程已经退出
end},
{win32, fun () ->
Cmd = "tasklist /nh /fi \"pid eq " ++ Pid ++ "\" ",
Res = rabbit_misc:os_cmd(Cmd ++ "2>&1"),
case re:run(Res, "erl\\.exe", [{capture, none}]) of
match -> true;
_ -> false
end
end}]).
with_os(Handlers) ->
{OsFamily, _} = os:type(),
case proplists:get_value(OsFamily, Handlers) of
undefined -> throw({unsupported_os, OsFamily});
Handler -> Handler()
end.
run_ps(Pid) ->
Port = erlang:open_port({spawn, "ps -p " ++ Pid}, %% 这里就是整个问题的关键了
[exit_status, {line, 16384},
use_stdio, stderr_to_stdout]),
exit_loop(Port).
exit_loop(Port) ->
receive
{Port, {exit_status, Rc}} -> Rc; %% 对应上面,只有 Rc 不等于 0 时,才会认为目标进程已退出
{Port, _} -> exit_loop(Port)
end.
知道了代码执行流程,简单添加一些关键打印,重新测试如下 问题复现
存在问题的环境[root@gfsd1 ~]#
[root@gfsd1 ~]# ps aux|grep rabbit
root 11813 0.0 0.0 106368 1004 pts/2 S 19:21 0:00 /bin/sh /etc/init.d/rabbitmq start
root 11815 0.0 0.0 106100 1200 pts/2 S 19:21 0:00 /bin/bash -c ulimit -S -c 0 >/dev/null 2>&1 ; /usr/local/rabbitmq/sbin/rabbitmq-server
root 11817 46.1 0.3 2806172 53252 pts/2 Sl 19:21 0:07 /usr/local/rabbitmq/erts-6.0/bin/beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/rabbitmq -progname erl -- -home /root -- -boot /usr/local/rabbitmq/releases/3.3.5/start_clean -pa /usr/local/rabbitmq/sbin/../ebin -noshell -noinput -s rabbit boot -sname rabbitmq@rmq_1 -boot start_sasl -config /usr/local/rabbitmq/sbin/../etc/rabbitmq/rabbitmq -kernel inet_default_connect_options [{nodelay,true}] -sasl errlog_type error -sasl sasl_error_logger false -rabbit error_logger {file,"/usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq@rmq_1.log"} -rabbit sasl_error_logger {file,"/usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq@rmq_1-sasl.log"} -rabbit enabled_plugins_file "/usr/local/rabbitmq/sbin/../etc/rabbitmq/enabled_plugins" -rabbit plugins_dir "/usr/local/rabbitmq/sbin/../plugins" -rabbit plugins_expand_dir "/usr/local/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1-plugins-expand" -os_mon start_cpu_sup false -os_mon start_disksup false -os_mon start_memsup false -mnesia dir "/usr/local/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1"
root 12534 0.0 0.0 103252 836 pts/2 S+ 19:21 0:00 grep rabbit
[root@gfsd1 ~]#
[root@gfsd1 ~]#
[root@gfsd1 ~]# cat /var/run/rabbitmq/pid
11817
[root@gfsd1 ~]#
[root@gfsd1 ~]#
[root@gfsd1 ~]# /etc/init.d/rabbitmq stop
Stopping rabbitmq-server: /var/run/rabbitmq/pid
Stopping and halting node rabbitmq@rmq_1 ...
Args = ["/var/run/rabbitmq/pid"]
...
action_stop => Res = ok Args = ["/var/run/rabbitmq/pid"]
...
read_pid_file => PidFile : "/var/run/rabbitmq/pid" Wait = false
wait_for_process_death Pid = "11817"
run_ps => erlang:open_port -> ps -p "11817"
exit_loop => Data = {data,{eol," PID TTY TIME CMD"}}
exit_loop => Data = {data,{eol,"11817 pts/2 00:00:08 beam.smp"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "11817"
run_ps => erlang:open_port -> ps -p "11817"
exit_loop => Data = {data,{eol," PID TTY TIME CMD"}}
exit_loop => Data = {data,{eol,"11817 pts/2 00:00:08 beam.smp"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "11817"
run_ps => erlang:open_port -> ps -p "11817"
exit_loop => Data = {data,{eol," PID TTY TIME CMD"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "11817"
run_ps => erlang:open_port -> ps -p "11817"
exit_loop => Data = {data,{eol," PID TTY TIME CMD"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "11817"
run_ps => erlang:open_port -> ps -p "11817"
exit_loop => Data = {data,{eol," PID TTY TIME CMD"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "11817"
run_ps => erlang:open_port -> ps -p "11817"
exit_loop => Data = {data,{eol," PID TTY TIME CMD"}}
exit_loop => Exit = {exit_status,0}
...
(永远停不下来)
...
^C
BREAK: (a)bort (c)ontinue (p)roc info (i)nfo (l)oaded
(v)ersion (k)ill (D)b-tables (d)istribution
^Cremove pid file and dir
rabbitmq-server.
[root@gfsd1 ~]# ^C
[root@gfsd1 ~]#
可以看到无论执行多少次循环,永远返回 {exit_status,0} ,所以导致 rabbitmqctl stop <pidfile> 无法停止。 正常环境中的输出
[root@Betty ~]#
[root@Betty ~]# /etc/init.d/rabbitmq start
Starting rabbitmq-server: SUCCESS
rabbitmq-server.
[root@Betty ~]#
[root@Betty ~]#
[root@Betty ~]# ps aux|grep rabbit
root 2431 0.0 0.0 106456 1004 pts/3 S 14:40 0:00 /bin/sh /etc/init.d/rabbitmq start
root 2434 0.0 0.0 106092 1172 pts/3 S 14:40 0:00 /bin/bash -c ulimit -S -c 0 >/dev/null 2>&1 ; /usr/sbin/rabbitmq-server
root 2436 42.8 1.4 2296732 56332 pts/3 Sl 14:40 0:02 /usr/local/lib/erlang/erts-6.0/bin/beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/lib/erlang -progname erl -- -home /root -- -pa /usr/lib/rabbitmq/sbin/../ebin -noshell -noinput -s rabbit boot -sname rmq_betty -boot start_sasl -config /etc/rabbitmq/rabbitmq -kernel inet_default_connect_options [{nodelay,true}] -sasl errlog_type error -sasl sasl_error_logger false -rabbit error_logger {file,"/var/log/rabbitmq/rmq_betty.log"} -rabbit sasl_error_logger {file,"/var/log/rabbitmq/rmq_betty-sasl.log"} -rabbit enabled_plugins_file "/etc/rabbitmq/enabled_plugins" -rabbit plugins_dir "/usr/lib/rabbitmq/sbin/../plugins" -rabbit plugins_expand_dir "/var/lib/rabbitmq/mnesia/rmq_betty-plugins-expand" -os_mon start_cpu_sup false -os_mon start_disksup false -os_mon start_memsup false -mnesia dir "/var/lib/rabbitmq/mnesia/rmq_betty" -kernel inet_dist_listen_min 25672 -kernel inet_dist_listen_max 25672
root 2567 0.0 0.0 103256 856 pts/3 S+ 14:40 0:00 grep rabbit
[root@Betty ~]#
[root@Betty ~]#
[root@Betty ~]# cat /var/run/rabbitmq/pid
2436
[root@Betty ~]#
[root@Betty ~]#
[root@Betty ~]# ps -p 2436
PID TTY TIME CMD
2436 pts/3 00:00:02 beam.smp
[root@Betty ~]#
[root@Betty ~]#
[root@Betty ~]# /etc/init.d/rabbitmq stop
Stopping rabbitmq-server: Stopping and halting node rmq_betty@Betty ...
Args = ["/var/run/rabbitmq/pid"]
...
read_pid_file => PidFile : "/var/run/rabbitmq/pid" Wait = false
wait_for_process_death Pid = "2436"
run_ps => erlang:open_port -> ps -p "2436"
exit_loop => Data = {data,{eol," PID TTY TIME CMD"}}
exit_loop => Data = {data,{eol," 2436 pts/3 00:00:02 beam.smp"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "2436"
run_ps => erlang:open_port -> ps -p "2436"
exit_loop => Data = {data,{eol," PID TTY TIME CMD"}}
exit_loop => Data = {data,{eol," 2436 pts/3 00:00:02 beam.smp"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "2436"
run_ps => erlang:open_port -> ps -p "2436"
exit_loop => Data = {data,{eol," PID TTY TIME CMD"}}
exit_loop => Exit = {exit_status,1}
...done.
rabbitmq-server.
[root@Betty ~]#
可以看到,循环检测 3 次后就返回了 {exit_status,1} ,从而 rabbitmqctl stop <pidfile> 能够正常退出。 问题深入
深入研究一下 erlang:open_port/2 的使用手册内容如下:
-=-=-=-=-=- 我是开始研究大数据问题的分隔线 -=-=-=-=-=-=-
open_port(PortName, PortSettings) -> port()
Returns a port identifier as the result of opening a new Erlang port. A port can be seen as an external Erlang process.
PortName is one of the following:
{spawn, Command}
Starts an external program. Command is the name of the external program which will be run. Command runs
outside the Erlang work space unless an Erlang driver with the name Command is found. If found, that driver will
be started. A driver runs in the Erlang workspace, which means that it is linked with the Erlang runtime system.
When starting external programs on Solaris, the system call vfork is used in preference to fork for performance
reasons, although it has a history of being less robust. If there are problems with using vfork, setting the
environment variable ERL_NO_VFORK to any value will cause fork to be used instead.
For external programs, the PATH is searched (or an equivalent method is used to find programs, depending on
operating system). This is done by invoking the shell on certain platforms. The first space separated token of
the command will be considered as the name of the executable (or driver). This (among other things) makes
this option unsuitable for running programs having spaces in file or directory names. Use {spawn_executable,
Command} instead if spaces in executable file names is desired.
{line, L}
Messages are delivered on a per line basis. Each line (delimited by the OS-dependent newline sequence) is
delivered in one single message. The message data format is {Flag, Line}, where Flag is either eol or
noeol and Line is the actual data delivered (without the newline sequence).
消息以行为基础单位进行投递;
每一行输出内容(由操作系统相关的换行序列确定)作为单独一条消息投递;
消息数据格式为 {Flag, Line} ,其中 Flag 是 eol 或者 noeol ;Line 为实际投递的数据内容(不包含换行序列)
L specifies the maximum line length in bytes. Lines longer than this will be delivered in more than one message,
with the Flag set to noeol for all but the last message. If end of file is encountered anywhere else than
immediately following a newline sequence, the last line will also be delivered with the Flag set to noeol. In
all other cases, lines are delivered with Flagset to eol.
L 指定了以字节为单位的最大行内容长度;
超过规定长度的行,将会以多条消息进行投递,除最后一条消息外,均设置 Flag 为 noeol ;
若遇到了文件的结束,但没有换行序列,则最后一行内容同样会被投递,且设置 Flag 为 noeol ;
在其他情况下,行数据会设置 Flag 为 eol 进行投递。
The {packet, N} and {line, L} settings are mutually exclusive.
exit_status
This is only valid for {spawn, Command} where Command refers to an external program, and for
{spawn_executable, Command}.
When the external process connected to the port exits, a message of the form {Port,
{exit_status,Status}} is sent to the connected process, where Status is the exit status of the external
process. If the program aborts, on Unix the same convention is used as the shells do (i.e., 128+signal).
If the eofoption has been given as well, the eofmessage and the exit_statusmessage appear in an
unspecified order.
当与 port 连接的外部进程退出时,形式为 {Port,{exit_status,Status}} 的消息会被发送到 erlang 进程,其中 Status 的内容为外部进程的退出状态;
如果外部程序主动 abort 了,在 Unix 系统中,将采用类似 shell 形式的退出码策略(即 128+signal)
If the port program closes its stdout without exiting, the exit_status option will not work.
如果 port 程序关闭了其 stdout 却没有退出,则 exit_status 选项不起作用。
use_stdio
This is only valid for {spawn, Command}and {spawn_executable, Command}. It allows the standard
input and output (file descriptors 0 and 1) of the spawned (UNIX) process for communication with Erlang.
该选项允许被创建 Unix 进程的 stdin 和 stdout 可以和 Erlang 进行通信。
stderr_to_stdout
Affects ports to external programs. The executed program gets its standard error file redirected to its standard
output file. stderr_to_stdout and nouse_stdio are mutually exclusive.
该选项对外部程序的 port 产生影响;
可执行程序的 stderr 会被重定向到其 stdout 上;
-=-=-=-=-=- 我是开始研究大数据问题的分隔线 -=-=-=-=-=-=-
从手册中可以知道,RabbitMQ 代码中是通过 Erlang Port 调用操作系统的命令,并根据返回结果的内容来判定对应的进程是否已停止的。
但在我们的环境中却遇到了外部进程退出状态错误的问题。
解决办法
1.修改 init 脚本,将调用 rabbitmqctl stop <pidfile> 的地方,改成 rabbitmqctl stop(需要相应调整针对 pidfile 的各种条件判定)
2.不要使用 VMware 虚拟机,因为在 KVM 虚拟机上没有发现此问题(VMware 粉不要喷我~)
---------------------------- ---------------------------- ---------------------------- ----------------------------
重大发现:根本原因是服务器被攻击了,ps 文件被替换掉了!
【被攻击机器】
确认问题 ps 的退出码
[root@gfsd1 ~]#
[root@gfsd1 ~]# ps -p 123
PID TTY TIME CMD
123 ? 00:00:00 kthrotld/1
[root@gfsd1 ~]#
[root@gfsd1 ~]# echo $?
0
[root@gfsd1 ~]#
[root@gfsd1 ~]# ps -p 1234
PID TTY TIME CMD
[root@gfsd1 ~]#
[root@gfsd1 ~]# echo $?
0 -- 正常的 ps 程序,此处应该返回 1
[root@gfsd1 ~]#
确认问题 ps 程序有哪些特征 [root@gfsd1 ~]# uname -a
Linux gfsd1 2.6.32-431.el6.x86_64 #1 SMP Fri Nov 22 03:15:09 UTC 2013 x86_64 x86_64 x86_64 GNU/Linux
[root@gfsd1 ~]#
ps 文件大小和正常机器上不同
[root@gfsd1 ~]# ll /bin/ps
-rwxr-xr-x 1 root root 1223123 Mar 18 10:42 /bin/ps
[root@gfsd1 ~]#
[root@gfsd1 ~]#
64bit 机器上出现了 32bit 的 ps 文件
[root@gfsd1 ~]# file /bin/ps
/bin/ps: ELF 32-bit LSB executable, Intel 80386, version 1 (SYSV), statically linked, for GNU/Linux 2.2.5, not stripped
[root@gfsd1 ~]#
正常机器上 ps 是动态链接的
[root@gfsd1 ~]# ldd /bin/ps
not a dynamic executable
[root@gfsd1 ~]#
跟踪问题 ps 的执行 [root@gfsd1 ~]#
[root@gfsd1 ~]# strace -f ps -p 1234
execve("/bin/ps", ["ps", "-p", "1234"], [/* 32 vars */]) = 0 -- 调用问题 ps 程序
[ Process PID=14740 runs in 32 bit mode. ]
...
set_thread_area(0xffc3db64) = 0
set_tid_address(0xa12c878) = 14740
...
getrlimit(RLIMIT_STACK, {rlim_cur=-4284481536, rlim_max=564187122106368*1024}) = 0
_sysctl({0x2081076ec, -3941488, (nil), (nil), (nil), 18429817727855099905}) = 0
...
close(3) = -1 EBADF (Bad file descriptor)
...
close(1023) = -1 EBADF (Bad file descriptor)
readlink("/proc/14740/exe", "/bin/ps", 1024) = 7
stat64("/bin/ps", {st_mode=S_IFREG|0755, st_size=1223123, ...}) = 0
getppid() = 14734
readlink("/proc/14734/exe", "/usr/bin/strace", 255) = 15
readlink("/proc/14740/exe", "/bin/ps", 1024) = 7
readlink("/proc/14740/exe", "/bin/ps", 1024) = 7
readlink("/proc/14740/exe", "/bin/ps", 1024) = 7
access("/usr/bin/dpkgd/ps", F_OK) = 0
pipe([3, 4]) = 0
clone(Process 14845 attached
child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0) = 14845 -- 创建了一个线程(正常 ps 没有该行为)
...
[pid 14845] dup2(4, 1 <unfinished ...>
[pid 14740] <... fstat64 resumed> {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
[pid 14845] <... dup2 resumed> ) = 1
[pid 14740] mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 <unfinished ...>
[pid 14845] close(4) = 0
...
[pid 14845] execve("/bin/sh", ["sh", "-c", "/usr/bin/dpkgd/ps -p 1234"], [/* 40 vars */] <unfinished ...> -- 在新建线程中调用的是 /usr/bin/dpkgd/ 目录下的 ps 文件!!!
[pid 14740] read(3, <unfinished ...>
[pid 14845] <... execve resumed> ) = 0
[ Process PID=14845 runs in 64 bit mode. ]
...
[pid 14845] getdents(4, /* 281 entries */, 32768) = 7352
[pid 14845] stat("/proc/1", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0
[pid 14845] open("/proc/1/stat", O_RDONLY) = 5
[pid 14845] read(5, "1 (init) S 0 1 1 0 -1 4202752 30"..., 1023) = 260
[pid 14845] close(5) = 0
[pid 14845] open("/proc/1/status", O_RDONLY) = 5
[pid 14845] read(5, "Name:\tinit\nState:\tS (sleeping)\nT"..., 1023) = 870
[pid 14845] close(5) = 0
[pid 14845] stat("/proc/2", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0
[pid 14845] open("/proc/2/stat", O_RDONLY) = 5
[pid 14845] read(5, "2 (kthreadd) S 0 0 0 0 -1 214961"..., 1023) = 155
[pid 14845] close(5) = 0
[pid 14845] open("/proc/2/status", O_RDONLY) = 5
[pid 14845] read(5, "Name:\tkthreadd\nState:\tS (sleepin"..., 1023) = 656
[pid 14845] close(5) = 0
...
[pid 14845] stat("/proc/28465", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0
[pid 14845] open("/proc/28465/stat", O_RDONLY) = 5
[pid 14845] read(5, "28465 (inet_gethost) S 28451 284"..., 1023) = 237
[pid 14845] close(5) = 0
[pid 14845] open("/proc/28465/status", O_RDONLY) = 5
[pid 14845] read(5, "Name:\tinet_gethost\nState:\tS (sle"..., 1023) = 884
[pid 14845] close(5) = 0
[pid 14845] getdents(4, /* 0 entries */, 32768) = 0
[pid 14845] close(4) = 0
[pid 14845] fstat(1, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
[pid 14845] mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f4528922000
[pid 14845] write(1, " PID TTY TIME CMD\n", 28) = 28
[ Process PID=14740 runs in 32 bit mode. ]
[pid 14740] <... read resumed> " PID TTY TIME CMD\n", 4096) = 28
[ Process PID=14845 runs in 64 bit mode. ]
[pid 14845] exit_group(1) = ? -- 线程中的 ps 退出码为 1
Process 14845 detached
[ Process PID=14740 runs in 32 bit mode. ]
fstat64(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xfffffffff778a000
--- SIGCHLD (Child exited) @ 0 (0) ---
write(1, " PID TTY TIME CMD\n", 28 PID TTY TIME CMD
) = 28
read(3, "", 4096) = 0
close(3) = 0
waitpid(14845, [{WIFEXITED(s) && WEXITSTATUS(s) == 1}], 0) = 14845 -- 可以看到,问题 ps 成功拿到了正确 ps 的退出码
munmap(0xf778b000, 4096) = 0
munmap(0xf778a000, 4096) = 0
exit_group(0) = ? -- 但问题 ps 最终以退出码 0 退出
[root@gfsd1 ~]#
由上面的信息,可以知道正确的 ps 程序被搬移到了下面的位置 [root@gfsd1 ~]# ll /usr/bin/dpkgd/
total 436
-rwxr-xr-x 1 root root 145872 Jan 28 05:00 lsof
-rwxr-xr-x 1 root root 128192 Jan 28 05:00 netstat
-rwxr-xr-x 1 root root 87088 Jan 28 05:00 ps
-rwxr-xr-x 1 root root 74840 Jan 28 05:00 ss
[root@gfsd1 ~]#
[root@gfsd1 ~]# file /usr/bin/dpkgd/*
/usr/bin/dpkgd/lsof: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped
/usr/bin/dpkgd/netstat: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped
/usr/bin/dpkgd/ps: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped
/usr/bin/dpkgd/ss: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped
[root@gfsd1 ~]#
[root@gfsd1 ~]#
[root@gfsd1 ~]# ldd /usr/bin/dpkgd/*
/usr/bin/dpkgd/lsof:
linux-vdso.so.1 => (0x00007fff86bff000)
libselinux.so.1 => /lib64/libselinux.so.1 (0x0000003522000000)
libc.so.6 => /lib64/libc.so.6 (0x0000003ef8200000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000003ef7e00000)
/lib64/ld-linux-x86-64.so.2 (0x0000003ef7a00000)
/usr/bin/dpkgd/netstat:
linux-vdso.so.1 => (0x00007fff3ddc2000)
libselinux.so.1 => /lib64/libselinux.so.1 (0x0000003522000000)
libc.so.6 => /lib64/libc.so.6 (0x0000003ef8200000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000003ef7e00000)
/lib64/ld-linux-x86-64.so.2 (0x0000003ef7a00000)
/usr/bin/dpkgd/ps:
linux-vdso.so.1 => (0x00007fff961b6000)
libselinux.so.1 => /lib64/libselinux.so.1 (0x0000003522000000)
libproc-3.2.8.so => /lib64/libproc-3.2.8.so (0x0000003ef8600000)
libc.so.6 => /lib64/libc.so.6 (0x0000003ef8200000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000003ef7e00000)
/lib64/ld-linux-x86-64.so.2 (0x0000003ef7a00000)
/usr/bin/dpkgd/ss:
linux-vdso.so.1 => (0x00007fffb51e1000)
libresolv.so.2 => /lib64/libresolv.so.2 (0x0000003ef9e00000)
libc.so.6 => /lib64/libc.so.6 (0x0000003ef8200000)
/lib64/ld-linux-x86-64.so.2 (0x0000003ef7a00000)
[root@gfsd1 ~]#
可以看到,被攻击的可执行程序不止 ps 一个... 【正常机器】
正常 ps 程序的特征
[root@YOYO ~]# uname -a
Linux YOYO 2.6.32-358.el6.x86_64 #1 SMP Fri Feb 22 00:31:26 UTC 2013 x86_64 x86_64 x86_64 GNU/Linux
[root@YOYO ~]#
[root@YOYO ~]# ll /bin/ps
-rwxr-xr-x. 1 root root 87088 11月 15 2012 /bin/ps
[root@YOYO ~]#
[root@YOYO ~]# file /bin/ps
/bin/ps: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped
[root@YOYO ~]#
[root@YOYO ~]# ldd /bin/ps
linux-vdso.so.1 => (0x00007fffb6d24000)
libselinux.so.1 => /lib64/libselinux.so.1 (0x00000034e7000000)
libproc-3.2.8.so => /lib64/libproc-3.2.8.so (0x0000003f38400000)
libc.so.6 => /lib64/libc.so.6 (0x0000003f38000000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000003f37c00000)
/lib64/ld-linux-x86-64.so.2 (0x0000003f37800000)
[root@YOYO ~]#
跟踪正常 ps 程序的执行 [root@YOYO ~]# strace -f ps -p 1234
execve("/bin/ps", ["ps", "-p", "1234"], [/* 28 vars */]) = 0
...
getdents(4, /* 241 entries */, 32768) = 6192
stat("/proc/1", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0
open("/proc/1/stat", O_RDONLY) = 5
read(5, "1 (init) S 0 1 1 0 -1 4202752 29"..., 1023) = 256
close(5) = 0
open("/proc/1/status", O_RDONLY) = 5
read(5, "Name:\tinit\nState:\tS (sleeping)\nT"..., 1023) = 866
close(5) = 0
stat("/proc/2", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0
open("/proc/2/stat", O_RDONLY) = 5
read(5, "2 (kthreadd) S 0 0 0 0 -1 214961"..., 1023) = 156
close(5) = 0
open("/proc/2/status", O_RDONLY) = 5
read(5, "Name:\tkthreadd\nState:\tS (sleepin"..., 1023) = 654
close(5) = 0
...
stat("/proc/25701", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0
open("/proc/25701/stat", O_RDONLY) = 5
read(5, "25701 (bash) S 25697 25701 25701"..., 1023) = 257
close(5) = 0
open("/proc/25701/status", O_RDONLY) = 5
read(5, "Name:\tbash\nState:\tS (sleeping)\nT"..., 1023) = 879
close(5) = 0
getdents(4, /* 0 entries */, 32768) = 0
close(4) = 0
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 2), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f4d2da3c000
write(1, " PID TTY TIME CMD\n", 28 PID TTY TIME CMD
) = 28
exit_group(1) = ?
[root@YOYO ~]#
可以看到正常情况下 ps -p xxx 是以退出码 1 退出的。 与此木马程序相关的文章:这里 。
系统漏洞相关文档:《NTRG_ElasticBotnetReport_06102015》
漏洞号:CVE-2015-1427
问题 ps 中的符号片段
...
08048b30 T CAttackIe::UpdateCurVariant(unsigned int)
08048b36 T CTcpAttack::UpdateCurVariant(unsigned int)
08048be4 T CAttackIcmp::MakePacket(unsigned int)
08048c5a T CAttackSyn::MakePacket(unsigned int)
08048d02 T CAttackUdp::MakePacket(unsigned int)
08048dce T CPacketAttack::SendPacket()
08048eac T CPacketAttack::Create()
08048f7e T CAttackAmp::Create()
08049070 T CPacketAttack::UpdateCurVariant(unsigned int)
08049306 T CTcpAttack::Do(unsigned int)
08049546 T CAttackPrx::MakePacket(unsigned int)
080497d6 T CAttackAmp::MakePacket(unsigned int)
0804996c T CAttackDns::MakePacket(unsigned int)
08049bf8 T CAttackCc::Destroy()
08049cc8 T CAttackCc::Create()
08049e8c T CAttackTns::Destroy()
08049f5c T CAttackTns::Create()
0804a05a T CAttackPrx::Create()
0804a2a6 T CAttackDns::Create()
0804a4f2 T CAttackCompress::CreatePackets()
0804a6ec T CAttackCompress::MakePacket(unsigned int)
0804a8be T CAttackCompress::Create()
...