SQL Window Functions
Window function performs after “FROM, JOIN, WHERE, GROUP BY, HAVING”, before “ORDER BY, LIMIT, SELECT DISTINCT”.
Window Function Syntax
fun()
can beAggregate functions
SUM, AVG, MIN, MAX, COUNT
Ranking functions
ROW_NUMBER
: generate sequential numbers within group.e.g. 123456
RANK
: generate numbers not necessarily sequential. Same value has same rank number.e.g. 11345
DENSE_RANK
: same asRANK
but returns sequential values with no gap.e. g. 11234556
NTILE
Analytic functions
LAG
: access to rows before the current rowLEAD
: access to rows after the current row
Need to specify the column name and offset. Offset cannot be a negative value. Can set the default value to be used if a previous / following row does not exist.
OVER()
takes a windowPARTITION BY
: divides the result into partitionsORDER BY
: defines the logical order of the rows within each groupROWS / RANGE
: specifies the starting and end points of a groupBETWEEN ... AND...
UNBOUNDED PRECEDING | PRECEDING | CURRENT ROW
UNBOUNDED FOLLOWING | FOLLOWING | CURRENT ROW
SELECT fun() OVER( PARTITION BY ... ORDER BY ... ROWS ... ) FROM Table;
Example:
ROWS between 5 preceding and current row # current row and previous 5 rows, 6 rows in total ROWS between current row and 5 following # current row and 5 following rows, 6 rows in total ROWS between 5 preceding and 5 folowing # previous 5 rows and 5 following rows, including current row, 11 rows in total
Aggregation Function
LeetCode 1303
Find the Team Size (Easy) [link]
Using LEFT JOIN
.
SELECT e1.employee_id, COUNT(*) AS team_size
FROM Employee e1
LEFT JOIN Employee e2
ON e1.team_id = e2.team_id
GROUP BY e1.employee_id;
Self Join
SELECT employee_id, (
SELECT COUNT(*)
FROM employee e2
WHERE e1.team_id = e2.team_id
) AS team_size
FROM employee e1;
Subquery
SELECT e.employee_id, c.team_size
FROM employee e, (
SELECT team_id, COUNT(*) AS team_size
FROM employee e
GROUP BY team_id
) AS c
WHERE c.team_id = e.team_id;
Using window functions
SELECT
employee_id,
COUNT(employee_id) OVER(PARTITION BY team_id) AS team_size
FROM Employee;
LeetCode 1126
Active Businesses (Medium) [link]
SELECT e.business_id
FROM Events e
LEFT JOIN (
SELECT event_type, AVG(occurences) as average
FROM Events
GROUP BY event_type
) AS t
ON e.event_type = t.event_type
WHERE e.occurences > t.average
GROUP BY e.business_id
HAVING COUNT(e.event_type) > 1;
Using window function
SELECT business_id
FROM (
SELECT a.*, AVG(occurences) OVER(PARTITION BY event_type) AS average
FROM Events a
) AS t
WHERE occurences > average
GROUP BY business_id
HAVING COUNT(event_type) > 1;
LeetCode 534
Game Play Analysis III (Medium) [link]
Using self join
SELECT a1.player_id, a1.event_date, SUM(a2.games_played) AS games_played_so_far
FROM Activity a1, Activity a2
WHERE a1.player_id = a2.player_id AND a1.event_date >= a2.event_date
GROUP BY a1.player_id, a1.event_date;
Using window function
SELECT player_id, event_date,
SUM(games_played) OVER(PARTITION BY player_id ORDER BY event_date) AS games_played_so_far
FROM Activity;
LeetCode 550
Game Play Analysis IV (Medium) [link]
Using WHERE ... IN
SELECT ROUND(COUNT(DISTINCT player_id) / (SELECT COUNT(DISTINCT player_id) FROM Activity),2) fraction
FROM Activity
WHERE (player_id, event_date) IN (
SELECT player_id, DATE(MIN(event_date) + 1)
FROM Activity
GROUP BY player_id
) ;
Using window function
SELECT ROUND(COUNT(DISTINCT t.player_id) / COUNT(DISTINCT a.player_id), 2) fraction
FROM Activity a, (
SELECT player_id,
DATEDIFF(event_date, MIN(event_date) OVER (PARTITION BY player_id)) AS diff
FROM Activity
) AS t
WHERE t.diff = 1;
LeetCode 1321
Restaurant Growth (Medium) [link]
SELECT visited_on, amount, average_amount
FROM (
SELECT visited_on,
SUM(amo) OVER(ORDER BY visited_on ROWS 6 PRECEDING) AS amount,
ROUND(AVG(amo) OVER(ORDER BY visited_on ROWS 6 PRECEDING),2) AS average_amount
FROM (
SELECT *, SUM(amount) AS amo
FROM Customer
GROUP BY visited_on
) t
) t2
WHERE DATEDIFF(
visited_on, (
SELECT MIN(visited_on) FROM Customer
) ) >= 6;
LeetCode 1867
Orders With Maximum Quantity Above Average (Medium) [link]
Using window function.
SELECT order_id
FROM(
SELECT DISTINCT order_id,
MAX(quantity) > MAX(AVG(quantity)) OVER() AS flag
FROM OrdersDetails
GROUP BY order_id
) t
WHERE flag = 1;
Ranking Functions
LeetCode 185
Department Top Three Salaries (Hard) [link]
Do not include the person where there are more than 3 salaries that are higher than he’s.
SELECT d.Name AS Department, e1.Name AS Employee, e1.Salary
FROM Employee e1
JOIN Department d
ON e1.DepartmentId = d.DepartmentId
WHERE
3 > (
SELECT COUNT(DISTINCT e2.Salary)
FROM Employee e2
WHERE e2.Salary > e1.Salary AND e1.DepartmentId = e2.DepartmentId
)
;
We first get a dense-ranking of employees and then select employees with rank <=3.
SELECT Department, Name AS Employee, Salary
FROM(
SELECT
Employee.Name AS Name,
Employee.Salary AS Salary,
Department.Name AS Department
DENSE_RANK() OVER (PARTITION BY DepartmentId ORDER BY Salary DESC) AS Rank
FROM Employee
JOIN Department
ON Employee.DepartmentId = Department.DepartmentId
) AS t
WHERE Rank <= 3;
Use GROUP BY
to select name and salary which has no more than three higher salaries.
SELECT D.Name as Department, E.Name as Employee, E.Salary
FROM Department D, Employee E, Employee E2
WHERE
D.DepartmentId = E.DepartmentId
AND E.DepartmentId = E2.DepartmentId
AND E.Salary <= E2.Salary
GROUP BY D.DepartmentId, E.Name
HAVING COUNT(DISTINCT E2.Salary) <= 3
ORDER BY D.Name, E.Salary DESC;
Note that JOIN ON
and WHERE
are similar. All three of the following queries produce the same correct result
SELECT *
FROM facebook
JOIN linkedin
ON facebook.name = linkedin.name;
SELECT *
FROM facebook
JOIN linkedin
WHERE facebook.name = linkedin.name;
SELECT *
FROM facebook, linkedin
WHERE facebook.name = linkedin.name;
LeetCode 512
Game Play Analysis II (Easy) [link]
Using WHERE...IN
.
SELECT player_id, device_id
FROM Activity
WHERE (player_id, event_date) IN(
SELECT player_id, MIN(event_date) AS event_date
FROM Activity
GROUP BY player_id
);
# Code below is not correct! We have to include what is used to GROUP BY
# SELECT player_id, device_id
# FROM Activity
# WHERE event_date IN(
# SELECT MIN(event_date) AS event_date
# FROM Activity
# GROUP BY player_id
# );
Using window function
SELECT t.player_id, t.device_id
FROM (
SELECT *,
DENSE_RANK() OVER(PARTITION BY player_id ORDER BY event_date) AS rnk
FROM Activity
) AS t
WHERE rnk = 1;
LeetCode 579
Find Cumulative Salary of an Employee (Hard) [link]
SELECT temp2.Id, temp2.Month, (
SELECT SUM(e2.Salary) FROM Employee e2
WHERE e2.Id = temp2.Id AND e2.Month BETWEEN temp2.Month - 2 AND temp2.Month
ORDER BY e2.Month DESC
LIMIT 3
) AS Salary
FROM (
SELECT e1.Id, e1.Month, e1.Salary
FROM Employee e1, (
SELECT e.Id, MAX(e.Month) AS Month
FROM Employee e
GROUP BY e.Id
) temp1
WHERE e1.Id = temp1.Id AND e1.Month < temp1.Month
ORDER BY e1.Id ASC, e1.Month DESC
) temp2;
This is how to exclude the most recent month
SELECT e1.Id, e1.Month, e1.Salary
FROM Employee e1, (
SELECT e.Id, MAX(e.Month) AS Month
FROM Employee e
GROUP BY e.Id
) temp1
WHERE e1.Id = temp1.Id AND e1.Month < temp1.Month
ORDER BY e1.Id ASC, e1.Month DESC;
Using window function rank()
and RANGE 2 PRECEDING
(faster).
SELECT id, month, Salary
FROM(
SELECT id, month,
SUM(salary) OVER(PARTITION BY id ORDER BY month RANGE 2 PRECEDING) AS Salary,
rank() OVER(PARTITION BY id ORDER BY month DESC) AS rnk
FROM Employee
) t
WHERE rnk > 1
ORDER BY id, month DESC;
LeetCode 613
Short Distance in a Line (Easy) [link]
SELECT MIN(t.diff) AS shortest
FROM(
SELECT ABS(p2.x - p1.x) AS diff
FROM Point p1
JOIN Point p2
ON p1.x != p2.x
) AS t;
SELECT MIN(t.diff) AS shortest
FROM (
SELECT ABS(p1.x - p2.x) AS diff
FROM Point p1, Point p2
WHERE p1.x != p2.x
) t
LeetCode 612
Shortest Distance in a Plane (Medium) [link]
SELECT ROUND(MIN(t.distance), 2) AS shortest
FROM(
SELECT SQRT((p2.x - p1.x) * (p2.x - p1.x) + (p2.y - p1.y) * (p2.y - p1.y)) AS distance
FROM Point2D p1
JOIN Point2D p2
ON p1.x != p2.x OR p1.y != p2.y
) AS t;
SELECT ROUND(MIN(dist),2) AS shortest
FROM(
SELECT SQRT((p2.x-p1.x) * (p2.x-p1.x) + (p2.y-p1.y) * (p2.y-p1.y)) AS dist
FROM Point2D p1, Point2D p2
WHERE p1.x != p2.x OR p1.y != p2.y
) t;
LeetCode 578
Get Highest Answer Rate Question (Medium) [link]
Using window function
SELECT question_id AS survey_log
FROM (
select
a.question_id,
dense_rank() over(order by sum(action = 'answer') / sum(action = 'show') desc, question_id) as rnk
from SurveyLog a
group by question_id
) AS t
WHERE t.rnk = 1;
Using GROUP BY
ratio
SELECT question_id AS survey_log
FROM SurveyLog
GROUP BY question_id
ORDER BY SUM(action = 'answer') / SUM(action = 'show') DESC, question_id
LIMIT 1;
SELECT question_id AS survey_log
FROM(
SELECT question_id, SUM(IF(answer_id is NULL, 0, 1)) / COUNT(question_id) AS rate
FROM SurveyLog
GROUP BY question_id
ORDER BY rate DESC, question_id
) t
LIMIT 1;
LeetCode 574
Winning Candidates (Medium) [link]
Using subquery
SELECT name
FROM Candidate C, (
SELECT candidateId, COUNT(candidateId) as cnt
FROM Vote
GROUP BY candidateId
ORDER BY cnt DESC
LIMIT 1
) AS t
WHERE C.id = t.candidateId;
SELECT name
FROM (
SELECT name, COUNT(name) AS cnt
FROM Vote v
LEFT JOIN Candidate c
ON v.candidateId = c.id
GROUP BY c.name
ORDER BY cnt DESC
LIMIT 1
) t;
LeetCode 602
Friend Requests II: Who Has the Most Friends (Medium) [link]
SELECT t2.friend AS id, t2.cnt AS num
FROM (
SELECT
t.friend, COUNT(t.friend) AS cnt,
dense_rank() OVER(ORDER BY COUNT(t.friend) DESC) AS rnk
FROM (
SELECT requester_id AS friend
FROM RequestAccepted
UNION ALL
SELECT accepter_id
FROM RequestAccepted
) t
GROUP BY t.friend
) t2
WHERE t2.rnk = 1
LeetCode 1112
Highest Grade For Each Student (Medium) [link]
Using subquery
SELECT student_id, MIN(course_id) AS course_id, grade
FROM Enrollments
WHERE (student_id, grade) IN(
SELECT student_id, MAX(grade)
FROM Enrollments
GROUP BY student_id
)
GROUP BY student_id
ORDER BY student_id;
Using window function
SELECT student_id, course_id, grade
FROM (
SELECT *,
dense_rank() OVER(PARTITION BY student_id ORDER BY grade DESC, course_id) AS rnk
FROM Enrollments
) AS t
WHERE t.rnk = 1;
LeetCode 1532
The Most Recent Three Orders (Medium) [link]
SELECT c.name AS customer_name, t.customer_id, t.order_id, t.order_date
FROM(
SELECT *,
dense_rank() OVER(PARTITION BY customer_id ORDER BY order_date DESC) as rnk
FROM Orders
) t
LEFT JOIN Customers c
ON t.customer_id = c.customer_id
WHERE t.rnk <= 3
ORDER BY c.name, t.customer_id, t.order_date DESC;
LeetCode 1596
The Most Frequently Ordered Products for Each Customer (Medium) [link]
Note that in the sub table, COUNT(*)
is grouped by customer_id
and product_id
.
SELECT t.customer_id, t.product_id, p.product_name
FROM(
SELECT customer_id, product_id,
rank() OVER(PARTITION BY customer_id ORDER BY COUNT(*) DESC) AS rnk
FROM Orders
GROUP BY customer_id, product_id
) t
LEFT JOIN Products p
ON t.product_id = p.product_id
WHERE rnk = 1;
LeetCode 1831
Maximum Transaction Each Day (Medium) [link]
Using WHERE IN
SELECT transaction_id
FROM Transactions
WHERE (DATE(day), amount) IN(
SELECT DATE(day), MAX(amount) AS amount
FROM Transactions
GROUP BY DATE(day)
)
ORDER BY transaction_id;
Using window function
SELECT transaction_id
FROM(
SELECT *,
dense_rank() OVER(PARTITION BY DATE(day) ORDER BY amount DESC) as rk
FROM Transactions
) t
WHERE rk = 1
ORDER BY transaction_id;
Analytic Functions
LeetCode 1454
Active Users (Medium) [link]
If the difference between the smallest date and the fifth bigger date is exactly 4, this means the user must have logged in for 5 consecutive days. We can use either LAG
or LEAD
. Remember to use DISTINCT
because one user may login more than 5 consecutive days more than once.
SELECT
DISTINCT t.id, a.name
FROM(
SELECT
id, login_date,
datediff(LEAD(login_date,4) OVER(PARTITION BY id ORDER BY login_date), login_date) AS lag4
FROM Logins
GROUP BY id, login_date
) t
LEFT JOIN Accounts a
ON a.id = t.id
WHERE t.lag4 = 4;
Filter the id and name where there are five days that are consecutive
SELECT id, name FROM Accounts WHERE id IN (
SELECT l1.id FROM Logins l1, Logins l2, Logins l3, Logins l4, Logins l5
WHERE l1.id = l2.id AND l2.id = l3.id AND l3.id = l4.id AND l4.id = l5.id
AND datediff(l2.login_date, l1.login_date) = 1
AND datediff(l3.login_date, l2.login_date) = 1
AND datediff(l4.login_date, l3.login_date) = 1
AND datediff(l5.login_date, l4.login_date) = 1
) ORDER BY id;
Self Join
SELECT DISTINCT a.id, a.name
FROM(
SELECT a.id, a.login_date as ad, b.login_date as bd
FROM logins a
JOIN logins b
ON a.id = b.id AND DATEDIFF(a.login_date, b.login_date) BETWEEN 0 AND 4
GROUP BY a.id, a.login_date
HAVING COUNT(DISTINCT b.login_date) = 5
) AS t
LEFT JOIN accounts a
ON a.id = t.id;
LeetCode 1709
Biggest Window Between Visits (Medium) [link]
Using LEAD
SELECT t.user_id, MAX(DATEDIFF(t.next_date, t.visit_date)) biggest_window
FROM (
SELECT user_id, visit_date,
LEAD(visit_date, 1, '2021-01-01') OVER(PARTITION BY user_id ORDER BY visit_date) AS next_date
FROM UserVisits
) AS t
GROUP BY user_id
ORDER BY user_id;
The result table of the following code is like:
SELECT user_id, visit_date,
LEAD(visit_date, 1, '2021-01-01') OVER(PARTITION BY user_id ORDER BY visit_date) AS next_date
FROM UserVisits;
# +---------+------------+------------+
# | user_id | visit_date | next_date |
# +---------+------------+------------+
# | 1 | 2020-10-20 | 2020-11-28 |
# | 1 | 2020-11-28 | 2020-12-3 |
# | 1 | 2020-12-3 | 2020-10-5 |
# | 2 | 2020-10-5 | 2020-12-9 |
# | 2 | 2020-12-9 | 2020-11-11 |
# | 3 | 2020-11-11 | 2021-01-01 |
# +---------+------------+------------+
# If LEAD(visit_date, 0, '2021-01-01'), the visit_date and next_date are the same
LeetCode 601
Human Traffic of Stadium (Hard) [link]
The method is tricky but useful.
with t1 as (
SELECT *, id - ROW_NUMBER() OVER(ORDER BY id) AS rk
FROM stadium
WHERE people >= 100
)
SELECT id, visit_date, people
FROM t1
WHERE rk IN (
SELECT rk
FROM t1
GROUP BY rk
HAVING COUNT(id) >= 3
)
ORDER BY visit_date;
LeetCode 626
Exchange Seats (Medium) [link]
Using CASE WHEN
SELECT (
CASE
WHEN MOD(id, 2) != 0 AND cnt != id THEN id+1
WHEN MOD(id, 2) != 0 AND cnt = id THEN id
ELSE id-1
END
) AS id, student
FROM Seat, (
SELECT COUNT(*) AS cnt
FROM Seat
) AS t
ORDER BY id;
Use LAG
and LEAD
to keep track of the last and next student name of the current name, the use IF
to decide the name based on odd and even id.
SELECT t.id, IF(t.id % 2 = 0, last, next) AS student
FROM (
SELECT id, student,
lag(student,1,student) OVER() last,
lead(student,1,student) OVER() next
FROM seat
) AS t;