-- 今天见到一条sql,大致意思为:A 表 left join B 表,要查出A表所有的数据,以及统计所有A表与B表相关行数
create table t1 (id int , name varchar(50),password varchar(50));
insert into t1 select id,concat(id,'rudy'),concat('password',id) from generate_series(1,100000) id;
alter table t1 add primary key(id);
create table t2 (id int , name varchar(50),password varchar(50),ref_id int);
insert into t2 select id,concat(id,'rudy'),concat('password',id),trunc(random()*1000) from generate_series(1,10000000) id;
alter table t2 add primary key(id);
create index on t2(ref_id);
--查询结果类似如下
postgres=# select t1.id,t1.name,t1.password,count(t1.id) from t1 left join t2 on t1.id=t2.ref_id group by t1.id order by id offset 800 limit 10;
id | name | password | count
-----+---------+-------------+-------
801 | 801rudy | password801 | 10119
802 | 802rudy | password802 | 9933
803 | 803rudy | password803 | 10011
804 | 804rudy | password804 | 9742
805 | 805rudy | password805 | 9990
806 | 806rudy | password806 | 10024
807 | 807rudy | password807 | 9806
808 | 808rudy | password808 | 10103
809 | 809rudy | password809 | 9855
810 | 810rudy | password810 | 9915
(10 rows)
Time: 1719.877 ms
--需要1.7s
--有没有办法优化呐?sql执行计划如下
postgres=# explain
postgres-# select t1.id,t1.name,t1.password,count(t1.id) from t1 left join t2 on t1.id=t2.ref_id group by t1.id order by id offset 800 limit 10;
QUERY PLAN
----------------------------------------------------------------------------------------------------------
Limit (cost=3824.83..3868.72 rows=10 width=26)
-> GroupAggregate (cost=313.12..439276.63 rows=100000 width=26)
-> Merge Left Join (cost=313.12..388277.26 rows=9999874 width=26)
Merge Cond: (t1.id = t2.ref_id)
-> Index Scan using t1_pkey on t1 (cost=0.29..3342.29 rows=100000 width=26)
-> Index Only Scan using t2_ref_id_idx on t2 (cost=0.43..259686.54 rows=9999874 width=4)
--发现sql过早了关联t2表,而且需要排序(merge操作)
--因为t1只需要返回10条数据(则可以延迟关联t2表),如果sql的执行计划是下面的步骤会更好
1. 对t1表执行order 排序
2. 对t1表取出offset 800 limit 10 的行
3. 拿t1表中的10行与t2 join 计算出count
--sql修改如下,去掉groupr操作,把t2表关联变成子查询
postgres=# select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from t1 order by id offset 800 limit 10;
id | name | password | cnt
-----+---------+-------------+-------
801 | 801rudy | password801 | 10119
802 | 802rudy | password802 | 9933
803 | 803rudy | password803 | 10011
804 | 804rudy | password804 | 9742
805 | 805rudy | password805 | 9990
806 | 806rudy | password806 | 10024
807 | 807rudy | password807 | 9806
808 | 808rudy | password808 | 10103
809 | 809rudy | password809 | 9855
810 | 810rudy | password810 | 9915
(10 rows)
Time: 1048.731 ms
--sql执行需要1s
--通过执行计划可知,没有了排序操作,但是需要很早的关联了t2表
postgres=# explain
postgres-# select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from t1 order by id offset 800 limit 10;
QUERY PLAN
-------------------------------------------------------------------------------------------------------
Limit (cost=249983.03..253107.81 rows=10 width=26)
-> Index Scan using t1_pkey on t1 (cost=0.29..31247842.29 rows=100000 width=26)
SubPlan 1
-> Aggregate (cost=312.44..312.44 rows=1 width=0)
-> Index Only Scan using t2_ref_id_idx on t2 (cost=0.43..287.44 rows=10000 width=0)
Index Cond: (ref_id = t1.id)
(6 rows)
--sql修改成,把t1表嵌套成一层,先返回t1表的数据
postgres=# select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from (select * from t1 order by id offset 800 limit 10) as t1 ;
id | name | password | cnt
-----+---------+-------------+-------
801 | 801rudy | password801 | 10119
802 | 802rudy | password802 | 9933
803 | 803rudy | password803 | 10011
804 | 804rudy | password804 | 9742
805 | 805rudy | password805 | 9990
806 | 806rudy | password806 | 10024
807 | 807rudy | password807 | 9806
808 | 808rudy | password808 | 10103
809 | 809rudy | password809 | 9855
810 | 810rudy | password810 | 9915
(10 rows)
Time: 111.015 ms
--执行时间0.1s
--通过执行计划可知,延迟关联t2表,对t2表的循环只需要10次
postgres=# explain
postgres-# select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from (select * from t1 order by id offset 800 limit 10) as t1 ;
QUERY PLAN
-------------------------------------------------------------------------------------------------
Subquery Scan on t1 (cost=27.03..3151.91 rows=10 width=26)
-> Limit (cost=27.03..27.36 rows=10 width=26)
-> Index Scan using t1_pkey on t1 t1_1 (cost=0.29..3342.29 rows=100000 width=26)
SubPlan 1
-> Aggregate (cost=312.44..312.44 rows=1 width=0)
-> Index Only Scan using t2_ref_id_idx on t2 (cost=0.43..287.44 rows=10000 width=0)
Index Cond: (ref_id = t1.id)
Time: 0.682 ms
--那是不是以上sql查询中第三个sql是最优化的呐,答案是不一定
--假如sql需要返回大量的数据时,比如我们把limit条件去掉
-- cost 消耗 31247842
postgres=# explain
postgres-# select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from t1 order by id offset 800 ;
QUERY PLAN
-------------------------------------------------------------------------------------------------------
Limit (cost=249983.03..31247842.29 rows=99200 width=26)
-> Index Scan using t1_pkey on t1 (cost=0.29..31247842.29 rows=100000 width=26)
SubPlan 1
-> Aggregate (cost=312.44..312.44 rows=1 width=0)
-> Index Only Scan using t2_ref_id_idx on t2 (cost=0.43..287.44 rows=10000 width=0)
Index Cond: (ref_id = t1.id)
(6 rows)
Time: 0.790 ms
-- cost 消耗 439276
postgres=# explain
postgres-# select t1.id,t1.name,t1.password,count(t1.id) from t1 left join t2 on t1.id=t2.ref_id group by t1.id order by id offset 800 ;
QUERY PLAN
----------------------------------------------------------------------------------------------------------
Limit (cost=3824.83..439276.63 rows=99200 width=26)
-> GroupAggregate (cost=313.12..439276.63 rows=100000 width=26)
-> Merge Left Join (cost=313.12..388277.26 rows=9999874 width=26)
Merge Cond: (t1.id = t2.ref_id)
-> Index Scan using t1_pkey on t1 (cost=0.29..3342.29 rows=100000 width=26)
-> Index Only Scan using t2_ref_id_idx on t2 (cost=0.43..259686.54 rows=9999874 width=4)
(6 rows)
Time: 0.856 ms
-- cost 消耗 30998878
postgres=# explain
postgres-# select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from (select * from t1 order by id offset 800 ) as t1 ;
QUERY PLAN
-------------------------------------------------------------------------------------------------
Subquery Scan on t1 (cost=27.03..30998878.29 rows=99200 width=26)
-> Limit (cost=27.03..3342.29 rows=99200 width=26)
-> Index Scan using t1_pkey on t1 t1_1 (cost=0.29..3342.29 rows=100000 width=26)
SubPlan 1
-> Aggregate (cost=312.44..312.44 rows=1 width=0)
-> Index Only Scan using t2_ref_id_idx on t2 (cost=0.43..287.44 rows=10000 width=0)
Index Cond: (ref_id = t1.id)
(7 rows)
--通过执行计划可知,此时sql效率最好的又变成了第2条sql查询
--因为很显然在需要大量行返回时,pg先把数据排序好,再通过merge join选出符合条件的数据,远比第一条sql或第3条sql通过循环子查询要好(子查询要循环99200行,如果t1表有1000000行数据时,循环子查询将是灾难)
--还是要感谢pg提供了hash join,merge join,nested loop 三种表连接
--同样的情况对于只有nested loop表连接的sql是什么样的呐
--构造测试数据如下
create table t1 (id int primary key, name varchar(50),password varchar(50));
insert into t1 select id,concat(id,'rudy'),concat('password',id) from nums where id<=100000;
create table t2 (id int primary key, name varchar(50),password varchar(50),ref_id int);
insert into t2 select id,concat(id,'rudy'),concat('password',id),floor(rand()*1000) from nums where id<=10000000;
create index idx_ref_id on t2(ref_id);
mysql> select t1.id,t1.name,t1.password,count(t1.id) from t1 left join t2 on t1.id=t2.ref_id group by t1.id order by id limit 800,10;
+-----+---------+-------------+--------------+
| id | name | password | count(t1.id) |
+-----+---------+-------------+--------------+
| 801 | 801rudy | password801 | 4239 |
| 802 | 802rudy | password802 | 4300 |
| 803 | 803rudy | password803 | 4157 |
| 804 | 804rudy | password804 | 4083 |
| 805 | 805rudy | password805 | 4251 |
| 806 | 806rudy | password806 | 4122 |
| 807 | 807rudy | password807 | 4251 |
| 808 | 808rudy | password808 | 4216 |
| 809 | 809rudy | password809 | 4161 |
| 810 | 810rudy | password810 | 4199 |
+-----+---------+-------------+--------------+
10 rows in set (1.32 sec)
--sql执行查询1.3s,通过执行计划可知,mysql选择了正确的执行计划
mysql> explain select t1.id,t1.name,t1.password,count(t1.id) from t1 left join t2 on t1.id=t2.ref_id group by t1.id order by id limit 800,10;
+----+-------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+-------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
| 1 | SIMPLE | t1 | NULL | index | PRIMARY | PRIMARY | 4 | NULL | 1 | 100.00 | NULL |
| 1 | SIMPLE | t2 | NULL | ref | idx_ref_id | idx_ref_id | 5 | test.t1.id | 4173 | 100.00 | Using index |
+----+-------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
2 rows in set, 1 warning (0.00 sec)
mysql> select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from t1 order by id limit 800,10;
+-----+---------+-------------+------+
| id | name | password | cnt |
+-----+---------+-------------+------+
| 801 | 801rudy | password801 | 4239 |
| 802 | 802rudy | password802 | 4300 |
| 803 | 803rudy | password803 | 4157 |
| 804 | 804rudy | password804 | 4083 |
| 805 | 805rudy | password805 | 4251 |
| 806 | 806rudy | password806 | 4122 |
| 807 | 807rudy | password807 | 4251 |
| 808 | 808rudy | password808 | 4216 |
| 809 | 809rudy | password809 | 4161 |
| 810 | 810rudy | password810 | 4199 |
+-----+---------+-------------+------+
10 rows in set (0.02 sec)
--sql执行查询0.02s
--通过执行计划可知,mysql使用了DEPENDENT SUBQUERY,但为什么执行时间更短呐
--这是因为mysql先对t1表进行排序取出满足条件的10条数据,而后nested loop对子查询t2表join,在t1很少的数据量的情况下,此时的子查询性能更好
mysql> explain select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from t1 order by id limit 800,10;
+----+--------------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+--------------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
| 1 | PRIMARY | t1 | NULL | index | NULL | PRIMARY | 4 | NULL | 810 | 100.00 | NULL |
| 2 | DEPENDENT SUBQUERY | t2 | NULL | ref | idx_ref_id | idx_ref_id | 5 | test.t1.id | 4173 | 100.00 | Using index |
+----+--------------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
2 rows in set, 2 warnings (0.00 sec)
mysql> select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from (select id,name,password from t1 order by id limit 800,10) t1;
+-----+---------+-------------+------+
| id | name | password | cnt |
+-----+---------+-------------+------+
| 801 | 801rudy | password801 | 4239 |
| 802 | 802rudy | password802 | 4300 |
| 803 | 803rudy | password803 | 4157 |
| 804 | 804rudy | password804 | 4083 |
| 805 | 805rudy | password805 | 4251 |
| 806 | 806rudy | password806 | 4122 |
| 807 | 807rudy | password807 | 4251 |
| 808 | 808rudy | password808 | 4216 |
| 809 | 809rudy | password809 | 4161 |
| 810 | 810rudy | password810 | 4199 |
+-----+---------+-------------+------+
10 rows in set (0.02 sec)
--sql执行时间0.02s,执行原理与上一个sql相同,但比上一个sql多了一步DERIVED操作,稍微弱于上一个查询
mysql> explain select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from (select id,name,password from t1 order by id limit 800,10) t1;
+----+--------------------+------------+------------+-------+---------------+------------+---------+-------+------+----------+-------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+--------------------+------------+------------+-------+---------------+------------+---------+-------+------+----------+-------------+
| 1 | PRIMARY | <derived3> | NULL | ALL | NULL | NULL | NULL | NULL | 810 | 100.00 | NULL |
| 3 | DERIVED | t1 | NULL | index | NULL | PRIMARY | 4 | NULL | 810 | 100.00 | NULL |
| 2 | DEPENDENT SUBQUERY | t2 | NULL | ref | idx_ref_id | idx_ref_id | 5 | t1.id | 4173 | 100.00 | Using index |
+----+--------------------+------------+------------+-------+---------------+------------+---------+-------+------+----------+-------------+
3 rows in set, 2 warnings (0.00 sec)
--当需要有大量数据要返回时,mysql执行计划不变,因为其只能使用nested loop表连接方式,所以此时原先的第二个sql查询已经不占用优势
--此时这个sql的执行效率更高
mysql> explain select t1.id,t1.name,t1.password,count(t1.id) from t1 left join t2 on t1.id=t2.ref_id group by t1.id order by id limit 800,10000000;
+----+-------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+-------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
| 1 | SIMPLE | t1 | NULL | index | PRIMARY | PRIMARY | 4 | NULL | 24 | 100.00 | NULL |
| 1 | SIMPLE | t2 | NULL | ref | idx_ref_id | idx_ref_id | 5 | test.t1.id | 4173 | 100.00 | Using index |
+----+-------------+-------+------------+-------+---------------+------------+---------+------------+------+----------+-------------+
2 rows in set, 1 warning (0.00 sec)
--此sql查询需要消耗更多的时间与资源
mysql> explain select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from t1 order by id limit 800,10000000;
+----+--------------------+-------+------------+-------+---------------+------------+---------+------------+--------+----------+-------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+--------------------+-------+------------+-------+---------------+------------+---------+------------+--------+----------+-------------+
| 1 | PRIMARY | t1 | NULL | index | NULL | PRIMARY | 4 | NULL | 100292 | 100.00 | NULL |
| 2 | DEPENDENT SUBQUERY | t2 | NULL | ref | idx_ref_id | idx_ref_id | 5 | test.t1.id | 4173 | 100.00 | Using index |
+----+--------------------+-------+------------+-------+---------------+------------+---------+------------+--------+----------+-------------+
2 rows in set, 2 warnings (0.00 sec)
--此sql查询需要消耗更多的时间与资源
mysql> explain select id,name,password,(select count(*) from t2 where t1.id=t2.ref_id) as cnt from (select id,name,password from t1 order by id limit 800,10000000) t1;
+----+--------------------+------------+------------+-------+---------------+------------+---------+-------+--------+----------+-------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+--------------------+------------+------------+-------+---------------+------------+---------+-------+--------+----------+-------------+
| 1 | PRIMARY | <derived3> | NULL | ALL | NULL | NULL | NULL | NULL | 100292 | 100.00 | NULL |
| 3 | DERIVED | t1 | NULL | index | NULL | PRIMARY | 4 | NULL | 100292 | 100.00 | NULL |
| 2 | DEPENDENT SUBQUERY | t2 | NULL | ref | idx_ref_id | idx_ref_id | 5 | t1.id | 4173 | 100.00 | Using index |
+----+--------------------+------------+------------+-------+---------------+------------+---------+-------+--------+----------+-------------+
3 rows in set, 2 warnings (0.00 sec)