下次,将您的数据粘贴为文本,这样我们就不必再输入了...
你是这个意思吗?对于最后一个日期,我更喜欢用“无限日期”而不是 NULL 值;我也更喜欢用“会话 id”而不是“岛标识符”这个说法,因为在点击流和物联网分析中通常就是这么称呼的……
-- Collapse each user's per-day e-mail usage rows into one row per uninterrupted
-- run of the same e-mail address (a "session"), reporting the first day of the
-- run and the day on which the user's next run begins.
WITH
-- Inline sample data: one row per (user, e-mail address, day it was used).
indata (userid, email, used_dt) AS (
              SELECT 1, 'someone@gmail.com',  DATE '2020-08-28'
    UNION ALL SELECT 1, 'someone@gmail.com',  DATE '2020-08-29'
    UNION ALL SELECT 1, 'someone@gmail.com',  DATE '2020-08-30'
    UNION ALL SELECT 1, 'someone@gmail.com',  DATE '2020-08-31'
    UNION ALL SELECT 1, 'someone1@gmail.com', DATE '2020-09-03'
    UNION ALL SELECT 1, 'someone1@gmail.com', DATE '2020-09-05'
    UNION ALL SELECT 1, 'someone1@gmail.com', DATE '2020-09-07'
    UNION ALL SELECT 1, 'someone@gmail.com',  DATE '2020-09-09'
    UNION ALL SELECT 2, 'bob@gmail.com',      DATE '2019-07-12'
    UNION ALL SELECT 3, 'alice@newmail.com',  DATE '2020-08-08'
)
,
-- Per user, in date order: flag each row whose e-mail differs from the
-- previous row's, and look ahead to the date of the next row.
-- `email` is a secondary sort key so that two different addresses used on the
-- same day are ordered deterministically instead of arbitrarily.
with_change_counter AS (
    SELECT
        userid
      , email
      , used_dt AS used_from_dt
      , CASE
            -- The '' default makes a user's first row count as a change
            -- (for any non-empty e-mail value).
            WHEN LAG(email, 1, '') OVER (
                     PARTITION BY userid ORDER BY used_dt, email
                 ) <> email
            THEN 1
            ELSE 0
        END AS counter
        -- A user's last row has no successor; give it an "infinity" end date.
        -- Typed DATE literal, so the default matches used_dt's type without
        -- relying on implicit string-to-date coercion.
      , LEAD(used_dt, 1, DATE '9999-12-31') OVER (
            PARTITION BY userid ORDER BY used_dt, email
        ) AS used_until_dt
    FROM indata
)
,
-- Running total of the change flags: every row of one uninterrupted run of
-- the same e-mail ends up with the same session id.
-- The default window frame is kept on purpose: exact duplicate rows are peers
-- under this ordering and therefore receive the same running total.
with_sess_id AS (
    SELECT
        userid
      , email
      , used_from_dt
      , used_until_dt
      , SUM(counter) OVER (
            PARTITION BY userid ORDER BY used_from_dt, email
        ) AS sessid
    FROM with_change_counter
)
-- One output row per (user, session).
SELECT
    userid
  , MAX(email)         AS email
  , MIN(used_from_dt)  AS email_start_date
  , MAX(used_until_dt) AS email_end_date
FROM with_sess_id
GROUP BY
    userid
  , sessid
-- (userid, sessid) is unique per output row, so no further sort key is needed.
ORDER BY
    userid
  , sessid
;
-- out userid | email | email_start_date | email_end_date
-- out --------+--------------------+------------------+----------------
-- out 1 | someone@gmail.com | 2020-08-28 | 2020-09-03
-- out 1 | someone1@gmail.com | 2020-09-03 | 2020-09-09
-- out 1 | someone@gmail.com | 2020-09-09 | 9999-12-31
-- out 2 | bob@gmail.com | 2019-07-12 | 9999-12-31
-- out 3 | alice@newmail.com | 2020-08-08 | 9999-12-31