Task01c:随机抽样与卡方检验的SQL实现
需要的工具及基础学习内容
- 工具: MySQL【先过一遍书,代码准确性具体还得二期编辑,希望用MySQL】
- 书籍:《数据分析技术 使用SQL和EXCEL工具 第二版》
- 数据集: 数据配套资源
- Task01a:复习SQL的基本知识
- Task01b:统计的基本概念及SQL实现
主要内容
- 抽样
- 假设检验
基于均值的比较
Z分数:测量样本值到期望值之间的距离,以标准差的数量测量。
随机抽样及分层抽样
随机样本是随机的,因此,该样本的统计是分布于平均值周围的。
-- 随机抽样
SELECT t.*
FROM <tab> t
WHERE RAND() < 0.1;
-- 可重复的随机抽样
SELECT t.*
FROM <tab> t
WHERE RAND(1) < 0.1;
-- 伪随机数生成器
WITH t AS (
SELECT t.*,ROW_NUMBER() OVER (ORDER BY col) AS seqnum
FROM <tab> t
)
SELECT t.*
FROM t
WHERE (seqnum*17+57) % 101 <= 10;
-- 分层平衡抽样 订单小于$200的200个订单的散点图。
SELECT OrderDate,
(CASE WHEN PaymentType="AE" THEN TotalPrice END) AS AE,
(CASE WHEN PaymentType="AE" THEN NULL ELSE TotalPrice END) AS NotAE
FROM Orders
WHERE TotalPrice <= 200
ORDER BY RAND()
LIMIT 200;
虚拟假设及可信度
-- 有多少客户仍然是活跃客户以及停止率
SELECT COUNT(*) AS numstarts,
SUM(CASE WHEN Stoptype IS NOT NULL THEN 1 ELSE 0 END) AS numstops,
AVG(CASE WHEN Stoptype IS NOT NULL THEN 1.0 ELSE 0 END) AS stoprate,
FROM Subscribers
WHERE Startdate = "2005-12-28"
-- 1. 给定数量,停止概率是多少?
-- 2. 给定概率,停止者的数量是多少?
概率和标准差、置信区间
-- 标准差以及置信区间为95%的下限和上限
SELECT stoprate-1.96*stderr AS conflower,
stoprate+1.96*stderr AS confupper,
stoprate,stderr,numstarts,numstops
FROM (
SELECT SQRT(stoprate*(1-stoprate)/numstarts) AS stderr,
stoprate,numstarts,numstops
FROM (SELECT COUNT(*) AS numstarts,
SUM(CASE WHEN Stoptype IS NOT NULL THEN 1 ELSE 0 END) AS numstops,
AVG(CASE WHEN Stoptype IS NOT NULL THEN 1.0 ELSE 0 END) AS stoprate
FROM Subscribers
WHERE startdate = "2005-12-28") s) s
卡方检验 比较两者之间的多个维度的区别。严格地讲,偏差是由于偶然导致的可能性是多少?
如果可能性很低,我们就能很自信地认为市场之间是有区别的。
卡方检验最基本的思想就是通过观察实际值与理论值的偏差来确定理论的正确与否。
实际工作中我们的理论原假设是:假设某特征(如性别、年龄)分布与目标值(是否流失)的分布相互独立,用通俗的话来解释就是“用户是不是流失与他的性别无关”。
当我们手头有一堆样本数据后,可以通过观察数据通过卡方值判断“原假设”是否成立。如果不成立,就要推翻原假设,证明实际情况是“用户流失跟性别可能有关系”。卡方值查P值,若P<0.05则拒绝原假设,支持性别和是否流失可能相互不独立;否则支持原假设,性别和是否流失相对独立,不存在关联。
- 卡方:使用偏移值的平方除以期望值,整个表的卡方值是所有卡方值的和。卡方分布:表的*度:(行数-1 )*(列数-1)
-- SQL中的卡方检验 不同市场之间是否有区别
SELECT market,isstopped,val,x,SQUARE(val-x)/x AS chisquare
FROM
(SELECT cells.market,cells.isstopped,(1.0*r.cnt*c.cnt/(SELECT COUNT(*) FROM subscribers WHERE startdate IN ("2005-12-26"))) AS x,cells.cnt AS val
FROM
(SELECT Market,(CASE WHEN Stoptype IS NOT NULL THEN 1 ELSE 0 END) AS isstopped,COUNT(*) AS cnt
FROM Subscribers
WHERE Startdate IN ("2005-12-26")
GROUP BY Market,(CASE WHEN Stoptype IS NOT NULL THEN 1 ELSE 0 END)) cells LEFT JOIN
(SELECT Market,COUNT(*) AS cnt
FROM Subscribers
WHERE startdate IN ("2005-12-26")
GROUP BY Market) r
ON cells.market = r.market LEFT JOIN
(SELECT (CASE WHEN stoptype IS NOT NULL THEN 1 ELSE 0 END) AS isstopped,COUNT(*) AS cnt
FROM Subscribers
WHERE startdate IN ("2005-12-26")
GROUP BY (CASE WHEN stoptype IS NOT NULL THEN 1 ELSE 0 END)) c
ON cells.isstopped = c.isstopped) a
ORDER BY market,isstopped
-- 对产品的偏好与地域相关吗? 产品组和州的组合关系
SELECT state,GroupName,val,exp,SQUARE(val-expx) / expx AS chisquare
FROM (
SELECT cells.state,cells.GroupName,1.0*r.cnt*c.cnt / (SELECT COUNT(DISTINCT CustomerId) FROM Orders) AS expx,
cells.cnt AS val
FROM
(SELECT o.State,p.GroupName,COUNT(DISTINCT o.CustomerId) AS cnt
FROM Orders o LEFT JOIN OrderLines ol
ON o.OrderId = ol.OrderId LEFT JOIN Products p
ON ol.ProductId = p.ProductId
GROUP BY o.state,p.GroupName) cells LEFT JOIN
(SELECT o.state,COUNT(DISTINCT o.CustomerID) AS cnt
FROM Orders o
GROUP BY o.state) r
ON cells.State = r.State LEFT JOIN
(SELECT p.GroupName,COUNT(DISTINCT o.CustomerId) AS cnt
FROM Orders o LEFT JOIN OrderLines ol
ON o.OrderId = ol.OrderId LEFT JOIN Products p
ON ol.ProductId = p.ProductId
GROUP BY p.GroupName) c
ON cells.GroupName = c.GroupName) a
ORDER BY chisquare DESC;
多维卡方 月份和支付类型与不同产品类型的特殊关系
WITH pmg AS (-- 从支付类型、月份、组名聚合计算单元格值
SELECT o.PaymentType,MONTH(o.OrderDate) AS mon,p.GroupName,COUNT(*) AS cnt
FROM Orders o JOIN OrderLines ol
ON o.OrderId = ol.OrderId JOIN Products p
ON ol.ProductId = p.ProductId
GROUP BY o.PaymentType,Month(o.OrderDate),p.GroupName),
pmgmarg AS (-- 计算每一个维度的总和
SELECT pmg.*,
SUM(cnt) OVER (PARTITION BY paymentType) AS cnt_pt,
SUM(cnt) OVER (PARTITION BY mon) AS cnt_mon,
SUM(cnt) OVER (PARTITION BY GroupName) SA cnt_gn,
SUM(cnt) OVER () AS cnt_all
FROM pmg
),
pmgexp AS (-- 计算期望值
SELECT pmgmarg.*,
(cnt_pt*cnt_mon*cnt_gn)/POWER(cnt_all,2) AS ExpectedValue
FROM pmgmarg
)
-- 计算卡方值
SELECT pmgexp.*,
SQUARE(cnt-ExpectedValue)/ExpectedValue AS chi2
FROM pmgexp
ORDER BY chi2 DESC;