In my current project I need to calculate mean, mode, median and percentiles (95th percentile, 99th percentile) of a given data set.
In this article I've put together a summary of what I've discovered during my research.
The following code snippets are taken from the following blog - please visit it for more detailed information:
http://blogs.lessthandot.com/index.php/DataMgmt/DataDesign/calculating-mean-median-and-mode-with-sq
MEAN Calculation
Mean is another name for average so we can use AVG function to calculate mean.
MEDIAN Calculation
Median is middle point in the set
DECLARE @Temp TABLE(Id INT IDENTITY(1,1), DATA DECIMAL(10,5))
INSERT INTO @Temp VALUES(1)
INSERT INTO @Temp VALUES(2)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(7)
INSERT INTO @Temp VALUES(9)
INSERT INTO @Temp VALUES(10)
INSERT INTO @Temp VALUES(NULL)
SELECT ((
SELECT TOP 1 DATA
FROM (
SELECT TOP 50 PERCENT DATA
FROM @Temp
WHERE DATA IS NOT NULL
ORDER BY DATA
) AS A
ORDER BY DATA DESC) +
(
SELECT TOP 1 DATA
FROM (
SELECT TOP 50 PERCENT DATA
FROM @Temp
WHERE DATA IS NOT NULL
ORDER BY DATA DESC
) AS A
ORDER BY DATA ASC)) / 2
--MODE Calculation
DECLARE @Temp TABLE(Id INT IDENTITY(1,1), DATA DECIMAL(10,5))
INSERT INTO @Temp VALUES(1)
INSERT INTO @Temp VALUES(2)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(7)
INSERT INTO @Temp VALUES(9)
INSERT INTO @Temp VALUES(10)
INSERT INTO @Temp VALUES(NULL)
SELECT TOP 1 WITH ties DATA
FROM @Temp
WHERE DATA IS Not NULL
GROUP BY DATA
ORDER BY COUNT(*) DESC
Percentile Calculation
Please reference the following blog for details:
http://www.sqlteam.com/article/computing-percentiles-in-sql-server
In this article I've put together a summary of what I've discovered during my research.
The following code snippets are taken from the following blog - please visit it for more detailed information:
http://blogs.lessthandot.com/index.php/DataMgmt/DataDesign/calculating-mean-median-and-mode-with-sq
MEAN Calculation
Mean is another name for average so we can use AVG function to calculate mean.
MEDIAN Calculation
Median is middle point in the set
DECLARE @Temp TABLE(Id INT IDENTITY(1,1), DATA DECIMAL(10,5))
INSERT INTO @Temp VALUES(1)
INSERT INTO @Temp VALUES(2)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(7)
INSERT INTO @Temp VALUES(9)
INSERT INTO @Temp VALUES(10)
INSERT INTO @Temp VALUES(NULL)
SELECT ((
SELECT TOP 1 DATA
FROM (
SELECT TOP 50 PERCENT DATA
FROM @Temp
WHERE DATA IS NOT NULL
ORDER BY DATA
) AS A
ORDER BY DATA DESC) +
(
SELECT TOP 1 DATA
FROM (
SELECT TOP 50 PERCENT DATA
FROM @Temp
WHERE DATA IS NOT NULL
ORDER BY DATA DESC
) AS A
ORDER BY DATA ASC)) / 2
--MODE Calculation
DECLARE @Temp TABLE(Id INT IDENTITY(1,1), DATA DECIMAL(10,5))
INSERT INTO @Temp VALUES(1)
INSERT INTO @Temp VALUES(2)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(7)
INSERT INTO @Temp VALUES(9)
INSERT INTO @Temp VALUES(10)
INSERT INTO @Temp VALUES(NULL)
SELECT TOP 1 WITH ties DATA
FROM @Temp
WHERE DATA IS Not NULL
GROUP BY DATA
ORDER BY COUNT(*) DESC
Percentile Calculation
Please reference the following blog for details:
http://www.sqlteam.com/article/computing-percentiles-in-sql-server
-- function for floating point division
CREATE FUNCTION dbo.FDIV
(@numerator float,
@denominator float)
RETURNS float
AS
BEGIN
RETURN CASE WHEN @denominator = 0.0
THEN 0.0
ELSE @numerator / @denominator
END
END
GO
-- function for linear interpolation
CREATE FUNCTION dbo.LERP
(@value float, -- between low and high
@low float,
@high float,
@newlow float,
@newhigh float)
RETURNS float -- between newlow and newhigh
AS
BEGIN
RETURN CASE
WHEN @value between @low and @high and @newlow <= @newhigh THEN @newlow + dbo.FDIV((@value-@low), (@high-@low)) * (@newhigh - @newlow)
WHEN @value = @low and @newlow is not NULL THEN @newlow
WHEN @value = @high and @newhigh is not NULL THEN @newhigh
ELSE NULL
END
END
GO
Declare @TestScores table (StudentID int, Score int)
insert @TestScores (StudentID, Score) Values (1, 20)
insert @TestScores (StudentID, Score) Values (2, 03)
insert @TestScores (StudentID, Score) Values (3, 40)
insert @TestScores (StudentID, Score) Values (4, 45)
insert @TestScores (StudentID, Score) Values (5, 50)
insert @TestScores (StudentID, Score) Values (6, 20)
insert @TestScores (StudentID, Score) Values (7, 90)
insert @TestScores (StudentID, Score) Values (8, 20)
insert @TestScores (StudentID, Score) Values (9, 11)
insert @TestScores (StudentID, Score) Values (10, 30)
--Find the percentile rank of a given score
-- The derived table makes one scan over the data values to compute some aggregates. The outer select interpolates
-- between the pth percentile of the nearest samples below and above the given value.
-- The result is 25, 0.5. That means a score of 25 is at the 50th percentile, the median, of the distribution.
declare @val float
set @val = 25
select
@val as val,
dbo.LERP(@val, scoreLT, scoreGE,
dbo.FDIV(countLT-1,countMinus1),
dbo.FDIV(countLT,countMinus1)) as percentrank
from (
select
SUM(CASE WHEN Score < @val
THEN 1 ELSE 0 END) as countLT,
count(*)-1 as countMinus1,
MAX(CASE WHEN Score < @val
THEN Score END) as scoreLT,
MIN(CASE WHEN Score >= @val
THEN Score END) as scoreGE
from @TestScores
) as x1
-- Find the percentile (the score) that characterizes a given percentage
declare @pp float
set @pp = .75
select
@pp as factor,
dbo.LERP(max(d), 0.0, 1.0, max(a.Score), max(b.Score)) as percentile
from
(
select floor(kf) as k, kf-floor(kf) as d
from (
select 1+@pp*(count(*)-1) as kf from @TestScores
) as x1
) as x2
join @TestScores a
on
(select count(*) from @TestScores aa
where aa.Score < a.Score) < k
join @TestScores b
on
(select count(*) from @TestScores bb
where bb.Score < b.Score) < k+1