nested loop,merge join,hash join与子查询优化


-- 今天见到一条sql,大致意思为:A 表 left join B 表,要查出A表所有的数据,以及统计所有A表与B表相关行数
create table t1 (id int , name varchar(50),password varchar(50));
insert into t1 select id,concat(id,'rudy'),concat('password',id) from generate_series(1,100000) id;
alter table t1 add primary key(id);
create table t2 (id int , name varchar(50),password varchar(50),ref_id int);
insert into t2 select id,concat(id,'rudy'),concat('password',id),trunc(random()*1000) from generate_series(1,10000000) id;
alter table t2 add primary key(id);
create index on t2(ref_id);
--查询结果类似如下
postgres=# select t1.id,t1.name,t1.password,count(t1.id) from t1 left join t2 on t1.id=t2.ref_id group by t1.id  order by id offset 800 limit 10;
 id  |  name   |  password   | count 
-----+---------+-------------+-------
 801 | 801rudy | password801 | 10119
 802 | 802rudy | password802 |  9933
 803 | 803rudy | password803 | 10011
 804 | 804rudy | password804 |  9742
 805 | 805rudy | password805 |  9990
 806 | 806rudy | password806 | 10024
 807 | 807rudy | password807 |  9806
 808 | 808rudy | password808 | 10103
 809 | 809rudy | password809 |  9855
 810 | 810rudy | password810 |  9915
(10 rows)

Time: 1719.877 ms
--需要1.7s

--有没有办法优化呐?sql执行计划如下
postgres=# explain
postgres-# select t1.id,t1.name,t1.password,count(t1.id) from t1 left join t2 on t1.id=t2.ref_id group by t1.id  order by id offset 800 limit 10;
                                                QUERY PLAN                                                
----------------------------------------------------------------------------------------------------------
 Limit  (cost=3824.83..3868.72 rows=10 width=26)
   ->  GroupAggregate  (cost=313.12..439276.63 rows=100000 width=26)
         ->  Merge Left Join  (cost=313.12..388277.26 rows=9999874 width=26)
               Merge Cond: (t1.id = t2.ref_id)
               ->  Index Scan using t1_pkey on t1  (cost=0.29..3342.29 rows=100000 width=26)
               ->  Index Only Scan using t2_ref_id_idx on t2  (cost=0.43..259686.54 rows=9999874 width=4)

--发现sql过早了关联t2表,而且需要排序(merge操作)
 
--因为t1只需要返回10条数据(则可以延迟关联t2表),如果sql的执行计划是下面的步骤会更好
1. 对t1表执行order 排序
2. 对t1表取出offset 800 limit 10 的行
3. 拿t1表中的10行与t2 join 计算出count




--sql修改如下,去掉groupr操作,把t2表关联变成子查询
postgres=# select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from t1 order by id offset 800 limit 10;               
 id  |  name   |  password   |  cnt  
-----+---------+-------------+-------
 801 | 801rudy | password801 | 10119
 802 | 802rudy | password802 |  9933
 803 | 803rudy | password803 | 10011
 804 | 804rudy | password804 |  9742
 805 | 805rudy | password805 |  9990
 806 | 806rudy | password806 | 10024
 807 | 807rudy | password807 |  9806
 808 | 808rudy | password808 | 10103
 809 | 809rudy | password809 |  9855
 810 | 810rudy | password810 |  9915
(10 rows)

Time: 1048.731 ms
--sql执行需要1s
--通过执行计划可知,没有了排序操作,但是需要很早的关联了t2表
postgres=# explain
postgres-# select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from t1 order by id offset 800 limit 10;
                                              QUERY PLAN                                               
-------------------------------------------------------------------------------------------------------
 Limit  (cost=249983.03..253107.81 rows=10 width=26)
   ->  Index Scan using t1_pkey on t1  (cost=0.29..31247842.29 rows=100000 width=26)
         SubPlan 1
           ->  Aggregate  (cost=312.44..312.44 rows=1 width=0)
                 ->  Index Only Scan using t2_ref_id_idx on t2  (cost=0.43..287.44 rows=10000 width=0)
                       Index Cond: (ref_id = t1.id)
(6 rows)



--sql修改成,把t1表嵌套成一层,先返回t1表的数据
postgres=# select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from (select * from t1 order by id offset 800 limit 10) as t1 ;
 id  |  name   |  password   |  cnt  
-----+---------+-------------+-------
 801 | 801rudy | password801 | 10119
 802 | 802rudy | password802 |  9933
 803 | 803rudy | password803 | 10011
 804 | 804rudy | password804 |  9742
 805 | 805rudy | password805 |  9990
 806 | 806rudy | password806 | 10024
 807 | 807rudy | password807 |  9806
 808 | 808rudy | password808 | 10103
 809 | 809rudy | password809 |  9855
 810 | 810rudy | password810 |  9915
(10 rows)

Time: 111.015 ms
--执行时间0.1s
--通过执行计划可知,延迟关联t2表,对t2表的循环只需要10次
postgres=# explain 
postgres-# select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from (select * from t1 order by id offset 800 limit 10) as t1 ;
                                           QUERY PLAN                                            
-------------------------------------------------------------------------------------------------
 Subquery Scan on t1  (cost=27.03..3151.91 rows=10 width=26)
   ->  Limit  (cost=27.03..27.36 rows=10 width=26)
         ->  Index Scan using t1_pkey on t1 t1_1  (cost=0.29..3342.29 rows=100000 width=26)
   SubPlan 1
     ->  Aggregate  (cost=312.44..312.44 rows=1 width=0)
           ->  Index Only Scan using t2_ref_id_idx on t2  (cost=0.43..287.44 rows=10000 width=0)
                 Index Cond: (ref_id = t1.id)
Time: 0.682 ms 




--那是不是以上sql查询中第三个sql是最优化的呐,答案是不一定

--假如sql需要返回大量的数据时,比如我们把limit条件去掉

-- cost 消耗 31247842
postgres=# explain
postgres-# select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from t1 order by id offset 800 ;
                                              QUERY PLAN                                               
-------------------------------------------------------------------------------------------------------
 Limit  (cost=249983.03..31247842.29 rows=99200 width=26)
   ->  Index Scan using t1_pkey on t1  (cost=0.29..31247842.29 rows=100000 width=26)
         SubPlan 1
           ->  Aggregate  (cost=312.44..312.44 rows=1 width=0)
                 ->  Index Only Scan using t2_ref_id_idx on t2  (cost=0.43..287.44 rows=10000 width=0)
                       Index Cond: (ref_id = t1.id)
(6 rows)

Time: 0.790 ms
-- cost 消耗 439276
postgres=# explain
postgres-# select t1.id,t1.name,t1.password,count(t1.id) from t1 left join t2 on t1.id=t2.ref_id group by t1.id  order by id offset 800 ;
                                                QUERY PLAN                                                
----------------------------------------------------------------------------------------------------------
 Limit  (cost=3824.83..439276.63 rows=99200 width=26)
   ->  GroupAggregate  (cost=313.12..439276.63 rows=100000 width=26)
         ->  Merge Left Join  (cost=313.12..388277.26 rows=9999874 width=26)
               Merge Cond: (t1.id = t2.ref_id)
               ->  Index Scan using t1_pkey on t1  (cost=0.29..3342.29 rows=100000 width=26)
               ->  Index Only Scan using t2_ref_id_idx on t2  (cost=0.43..259686.54 rows=9999874 width=4)
(6 rows)

Time: 0.856 ms
-- cost 消耗 30998878 
postgres=# explain 
postgres-# select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from (select * from t1 order by id offset 800 ) as t1 ;
                                           QUERY PLAN                                            
-------------------------------------------------------------------------------------------------
 Subquery Scan on t1  (cost=27.03..30998878.29 rows=99200 width=26)
   ->  Limit  (cost=27.03..3342.29 rows=99200 width=26)
         ->  Index Scan using t1_pkey on t1 t1_1  (cost=0.29..3342.29 rows=100000 width=26)
   SubPlan 1
     ->  Aggregate  (cost=312.44..312.44 rows=1 width=0)
           ->  Index Only Scan using t2_ref_id_idx on t2  (cost=0.43..287.44 rows=10000 width=0)
                 Index Cond: (ref_id = t1.id)
(7 rows)

--通过执行计划可知,此时sql效率最好的又变成了第2条sql查询
--因为很显然在需要大量行返回时,pg先把数据排序好,再通过merge join选出符合条件的数据,远比第一条sql或第3条sql通过循环子查询要好(子查询要循环99200行,如果t1表有1000000行数据时,循环子查询将是灾难)
--还是要感谢pg提供了hash join,merge join,nested loop 三种表连接









--同样的情况对于只有nested loop表连接的sql是什么样的呐
--构造测试数据如下
create table t1 (id int primary key, name varchar(50),password varchar(50));
insert into t1 select id,concat(id,'rudy'),concat('password',id) from nums where id<=100000;
create table t2 (id int primary key, name varchar(50),password varchar(50),ref_id int);
insert into t2 select id,concat(id,'rudy'),concat('password',id),floor(rand()*1000) from nums where id<=10000000;
create index idx_ref_id on t2(ref_id);



mysql> select t1.id,t1.name,t1.password,count(t1.id) from t1 left join t2 on t1.id=t2.ref_id group by t1.id  order by id limit 800,10;
+-----+---------+-------------+--------------+
| id  | name    | password    | count(t1.id) |
+-----+---------+-------------+--------------+
| 801 | 801rudy | password801 |         4239 |
| 802 | 802rudy | password802 |         4300 |
| 803 | 803rudy | password803 |         4157 |
| 804 | 804rudy | password804 |         4083 |
| 805 | 805rudy | password805 |         4251 |
| 806 | 806rudy | password806 |         4122 |
| 807 | 807rudy | password807 |         4251 |
| 808 | 808rudy | password808 |         4216 |
| 809 | 809rudy | password809 |         4161 |
| 810 | 810rudy | password810 |         4199 |
+-----+---------+-------------+--------------+
10 rows in set (1.32 sec)
--sql执行查询1.3s,通过执行计划可知,mysql选择了正确的执行计划
mysql> explain select t1.id,t1.name,t1.password,count(t1.id) from t1 left join t2 on t1.id=t2.ref_id group by t1.id  order by id limit 800,10;
+----+-------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
| id | select_type | table | partitions | type  | possible_keys | key        | key_len | ref        | rows | filtered | Extra       |
+----+-------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
|  1 | SIMPLE      | t1    | NULL       | index | PRIMARY       | PRIMARY    | 4       | NULL       |    1 |   100.00 | NULL        |
|  1 | SIMPLE      | t2    | NULL       | ref   | idx_ref_id    | idx_ref_id | 5       | test.t1.id | 4173 |   100.00 | Using index |
+----+-------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
2 rows in set, 1 warning (0.00 sec)


mysql>  select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from t1 order by id limit 800,10;
+-----+---------+-------------+------+
| id  | name    | password    | cnt  |
+-----+---------+-------------+------+
| 801 | 801rudy | password801 | 4239 |
| 802 | 802rudy | password802 | 4300 |
| 803 | 803rudy | password803 | 4157 |
| 804 | 804rudy | password804 | 4083 |
| 805 | 805rudy | password805 | 4251 |
| 806 | 806rudy | password806 | 4122 |
| 807 | 807rudy | password807 | 4251 |
| 808 | 808rudy | password808 | 4216 |
| 809 | 809rudy | password809 | 4161 |
| 810 | 810rudy | password810 | 4199 |
+-----+---------+-------------+------+
10 rows in set (0.02 sec)
--sql执行查询0.02s
--通过执行计划可知,mysql使用了DEPENDENT SUBQUERY,但为什么执行时间更短呐
--这是因为mysql先对t1表进行排序取出满足条件的10条数据,而后nested loop对子查询t2表join,在t1很少的数据量的情况下,此时的子查询性能更好
mysql> explain  select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from t1 order by id limit 800,10;
+----+--------------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
| id | select_type        | table | partitions | type  | possible_keys | key        | key_len | ref        | rows | filtered | Extra       |
+----+--------------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
|  1 | PRIMARY            | t1    | NULL       | index | NULL          | PRIMARY    | 4       | NULL       |  810 |   100.00 | NULL        |
|  2 | DEPENDENT SUBQUERY | t2    | NULL       | ref   | idx_ref_id    | idx_ref_id | 5       | test.t1.id | 4173 |   100.00 | Using index |
+----+--------------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
2 rows in set, 2 warnings (0.00 sec)


mysql> select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from (select id,name,password from t1 order by id limit 800,10) t1;
+-----+---------+-------------+------+
| id  | name    | password    | cnt  |
+-----+---------+-------------+------+
| 801 | 801rudy | password801 | 4239 |
| 802 | 802rudy | password802 | 4300 |
| 803 | 803rudy | password803 | 4157 |
| 804 | 804rudy | password804 | 4083 |
| 805 | 805rudy | password805 | 4251 |
| 806 | 806rudy | password806 | 4122 |
| 807 | 807rudy | password807 | 4251 |
| 808 | 808rudy | password808 | 4216 |
| 809 | 809rudy | password809 | 4161 |
| 810 | 810rudy | password810 | 4199 |
+-----+---------+-------------+------+
10 rows in set (0.02 sec)
--sql执行时间0.02s,执行原理与上一个sql相同,但比上一个sql多了一步DERIVED操作,稍微弱于上一个查询
mysql> explain select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from (select id,name,password from t1 order by id limit 800,10) t1;
+----+--------------------+------------+------------+-------+---------------+------------+---------+-------+------+----------+-------------+
| id | select_type        | table      | partitions | type  | possible_keys | key        | key_len | ref   | rows | filtered | Extra       |
+----+--------------------+------------+------------+-------+---------------+------------+---------+-------+------+----------+-------------+
|  1 | PRIMARY            | <derived3> | NULL       | ALL   | NULL          | NULL       | NULL    | NULL  |  810 |   100.00 | NULL        |
|  3 | DERIVED            | t1         | NULL       | index | NULL          | PRIMARY    | 4       | NULL  |  810 |   100.00 | NULL        |
|  2 | DEPENDENT SUBQUERY | t2         | NULL       | ref   | idx_ref_id    | idx_ref_id | 5       | t1.id | 4173 |   100.00 | Using index |
+----+--------------------+------------+------------+-------+---------------+------------+---------+-------+------+----------+-------------+
3 rows in set, 2 warnings (0.00 sec)


--当需要有大量数据要返回时,mysql执行计划不变,因为其只能使用nested loop表连接方式,所以此时原先的第二个sql查询已经不占用优势
--此时这个sql的执行效率更高
mysql> explain select t1.id,t1.name,t1.password,count(t1.id) from t1 left join t2 on t1.id=t2.ref_id group by t1.id  order by id limit 800,10000000;
+----+-------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
| id | select_type | table | partitions | type  | possible_keys | key        | key_len | ref        | rows | filtered | Extra       |
+----+-------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
|  1 | SIMPLE      | t1    | NULL       | index | PRIMARY       | PRIMARY    | 4       | NULL       |   24 |   100.00 | NULL        |
|  1 | SIMPLE      | t2    | NULL       | ref   | idx_ref_id    | idx_ref_id | 5       | test.t1.id | 4173 |   100.00 | Using index |
+----+-------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
2 rows in set, 1 warning (0.00 sec)

--此sql查询需要消耗更多的时间与资源
mysql> explain  select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from t1 order by id limit 800,10000000;
+----+--------------------+-------+------------+-------+---------------+------------+---------+------------+--------+----------+-------------+
| id | select_type        | table | partitions | type  | possible_keys | key        | key_len | ref        | rows   | filtered | Extra       |
+----+--------------------+-------+------------+-------+---------------+------------+---------+------------+--------+----------+-------------+
|  1 | PRIMARY            | t1    | NULL       | index | NULL          | PRIMARY    | 4       | NULL       | 100292 |   100.00 | NULL        |
|  2 | DEPENDENT SUBQUERY | t2    | NULL       | ref   | idx_ref_id    | idx_ref_id | 5       | test.t1.id |   4173 |   100.00 | Using index |
+----+--------------------+-------+------------+-------+---------------+------------+---------+------------+--------+----------+-------------+
2 rows in set, 2 warnings (0.00 sec)
--此sql查询需要消耗更多的时间与资源
mysql> explain select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from (select id,name,password from t1 order by id limit 800,10000000) t1;
+----+--------------------+------------+------------+-------+---------------+------------+---------+-------+--------+----------+-------------+
| id | select_type        | table      | partitions | type  | possible_keys | key        | key_len | ref   | rows   | filtered | Extra       |
+----+--------------------+------------+------------+-------+---------------+------------+---------+-------+--------+----------+-------------+
|  1 | PRIMARY            | <derived3> | NULL       | ALL   | NULL          | NULL       | NULL    | NULL  | 100292 |   100.00 | NULL        |
|  3 | DERIVED            | t1         | NULL       | index | NULL          | PRIMARY    | 4       | NULL  | 100292 |   100.00 | NULL        |
|  2 | DEPENDENT SUBQUERY | t2         | NULL       | ref   | idx_ref_id    | idx_ref_id | 5       | t1.id |   4173 |   100.00 | Using index |
+----+--------------------+------------+------------+-------+---------------+------------+---------+-------+--------+----------+-------------+
3 rows in set, 2 warnings (0.00 sec)

上一篇:C++第14周项目2—— 成绩处理


下一篇:MySQL分布式事务(XA事务)