sql统计区间分布
# Distribution of Completion_rate over four buckets, as shares of all rows:
#   a: [0, 0.25)   b: [0.25, 0.5)   c: [0.5, 0.75)   d: [0.75, 1]
# COUNT(CASE WHEN ... THEN 1 END) counts only the rows matching the bucket
# (the THEN value is irrelevant — COUNT checks non-null — so use 1 throughout).
# NOTE(review): rows where Completion_rate is NULL or outside [0, 1] are in
# the denominator count(*) but in no bucket, so a+b+c+d can be < 1 (the sample
# result sums to 0.89). `other` surfaces that remainder explicitly.
Overall_distribution = spark.sql("""
select
    count(*) as total_,
    round(count(case when Completion_rate >= 0    and Completion_rate < 0.25 then 1 end) / count(*), 2) as a,
    round(count(case when Completion_rate >= 0.25 and Completion_rate < 0.5  then 1 end) / count(*), 2) as b,
    round(count(case when Completion_rate >= 0.5  and Completion_rate < 0.75 then 1 end) / count(*), 2) as c,
    round(count(case when Completion_rate >= 0.75 and Completion_rate <= 1   then 1 end) / count(*), 2) as d,
    round(count(case when Completion_rate is null
                       or Completion_rate < 0
                       or Completion_rate > 1 then 1 end) / count(*), 2) as other
from parsed_df2
""")
'''
+------+----+----+----+----+
|total_| a| b| c| d|
+------+----+----+----+----+
|136173|0.46|0.12|0.07|0.24|
+------+----+----+----+----+
结果如上