【发布时间】:2019-12-12 07:39:32
【问题描述】:
我阅读了类似的问题,并且可以通过使用窗口函数使其工作,但是,由于 ClickHouse 似乎不支持它们,我正在寻找替代解决方案。
鉴于 (1, 5), (2, 3), (3, 8), (10, 15) 等时间间隔,我想将重叠的间隔“合并”为单个间隔。在我的示例中,它将是:
(1, 8) 和 (10, 15)。
任何指针表示赞赏!谢谢!
【问题讨论】:
标签: clickhouse
我阅读了类似的问题,并且可以通过使用窗口函数使其工作,但是,由于 ClickHouse 似乎不支持它们,我正在寻找替代解决方案。
鉴于 (1, 5), (2, 3), (3, 8), (10, 15) 等时间间隔,我想将重叠的间隔“合并”为单个间隔。在我的示例中,它将是:
(1, 8) 和 (10, 15)。
任何指针表示赞赏!谢谢!
【问题讨论】:
标签: clickhouse
如果此函数已与任意 lambda 一起使用,arrayReduce 将很容易解决此任务。虽然这不存在,但请尝试通过可用的方式解决问题。
SELECT
intervals,
arraySort(x -> x, intervals) sortedIntervals,
/* try to merge each interval with precede ones */
arrayMap((x, index) -> index != 1
? (arrayReduce(
'min',
arrayMap(
i -> sortedIntervals[i + 1].1,
/* get indexes of intervals that can be merged with the current one (index is zero-based) */
arrayFilter(
i -> x.1 <= sortedIntervals[i + 1].2 AND x.2 >= sortedIntervals[i + 1].1,
range(index)))),
arrayReduce(
'max',
arrayMap(
i -> sortedIntervals[i + 1].2,
/* get indexes of intervals that can be merged with the current one (index is zero-based) */
arrayFilter(
i -> x.1 <= sortedIntervals[i + 1].2 AND x.2 >= sortedIntervals[i + 1].1,
range(index)))))
: x,
sortedIntervals,
arrayEnumerate(sortedIntervals)) rawResult,
/* filter out intervals nested to other ones */
arrayFilter(
(x, index) -> index == length(rawResult) OR x.1 != rawResult[index + 1].1,
rawResult,
arrayEnumerate(rawResult)) result
FROM
(
SELECT [(1, 5), (2, 3), (3, 8), (10, 15)] intervals
UNION ALL
SELECT [(2, 4), (1, 3), (3, 6), (12, 14), (7, 7), (13, 16), (9, 9), (8, 9), (10, 15)]
UNION ALL
SELECT [(20, 22), (18, 18), (16, 21), (1, 8), (2, 9), (3, 5), (10, 12), (11, 13), (14, 15)]
UNION ALL
SELECT []
UNION ALL
SELECT [(1, 11)]
)
FORMAT Vertical;
/*
Row 1:
──────
intervals: [(2,4),(1,3),(3,6),(12,14),(7,7),(13,16),(9,9),(8,9),(10,15)]
sortedIntervals: [(1,3),(2,4),(3,6),(7,7),(8,9),(9,9),(10,15),(12,14),(13,16)]
rawResult: [(1,3),(1,4),(1,6),(7,7),(8,9),(8,9),(10,15),(10,15),(10,16)]
result: [(1,6),(7,7),(8,9),(10,16)]
Row 2:
──────
intervals: [(1,5),(2,3),(3,8),(10,15)]
sortedIntervals: [(1,5),(2,3),(3,8),(10,15)]
rawResult: [(1,5),(1,5),(1,8),(10,15)]
result: [(1,8),(10,15)]
Row 3:
──────
intervals: [(20,22),(18,18),(16,21),(1,8),(2,9),(3,5),(10,12),(11,13),(14,15)]
sortedIntervals: [(1,8),(2,9),(3,5),(10,12),(11,13),(14,15),(16,21),(18,18),(20,22)]
rawResult: [(1,8),(1,9),(1,9),(10,12),(10,13),(14,15),(16,21),(16,21),(16,22)]
result: [(1,9),(10,13),(14,15),(16,22)]
Row 4:
──────
intervals: []
sortedIntervals: []
rawResult: []
result: []
Row 5:
──────
intervals: [(1,11)]
sortedIntervals: [(1,11)]
rawResult: [(1,11)]
result: [(1,11)]
*/
【讨论】:
SELECT arraySplit((x, y) -> y, sortedDisArray, arrayMap(x -> if(x <= 1, 0, 1), arrayDifference(sortedDisArray))) AS z,
arrayMap(el -> (arrayReduce('min', el), arrayReduce('max', el)), z) as result
FROM
(
SELECT [(1, 5), (2, 3), (3, 8), (10, 15)] AS a,
arrayMap(x -> x.2 -x.1 ,a) as durations,
arrayMap((i, y) ->
arrayMap(n -> n + a[y].1 ,
range(toUInt64(durations[y]+1))) , durations, arrayEnumerate(a)) as disArray,
arraySort(arrayDistinct(flatten(disArray))) as sortedDisArray
)
【讨论】:
我遇到了同样的问题,不幸的是,这里的两个答案都不适合我。 first one 有错误。试试下面的例子:
[(1, 5), (2, 3), (3, 8), (6, 9), (11, 15)]
它会让你回来
[(1, 8), (3, 9), (11, 15)]
second one 是正确的,但它需要将所有区间转换为点列表。就我而言,我有一个日期时间间隔,所以我必须创建一个非常大的数组,其中包含间隔中每一秒的值。会很重。
这就是我试图找到解决方案的原因,我希望我做到了。
SELECT [ (1,5), (3, 7), (5, 10), (6,7), (7,8), (9, 12), (15, 20), (16, 22) ] AS sortedIntervals,
-- collect a new array with finishes of all intervals
arrayMap(i -> i.2, sortedIntervals) as finishes,
-- create an array with cumulative finishes (merge small internal intervals and increase finish)
arrayMap( (i, y) -> arrayReduce('max', arraySlice(finishes, 1, y)), finishes, arrayEnumerate(finishes)) AS cumulative_finish,
-- get sequence number of intervals, that will generate a new intervals (which will not be merged)
arrayFilter ( i -> i > 0 ,
arrayMap( (i, y) ->
-- there is no check for element existing, because array[0] will return 0
-- if start of current interval is bigger then cumulative finish of previous one - it's a new interval
( i.1 > cumulative_finish[y-1] )
? y
: 0, sortedIntervals, arrayEnumerate(sortedIntervals))
) AS new_interval_indexes,
-- get result's intervals.
-- the first start - it's our initial start, finish - it's a value from cumulative finish of previous for new start interval
arrayMap( (i, n ) ->
-- there is no checking on existing element, because array[<unexisting_element>] return 0, array[-1] return last element, that we want
( sortedIntervals[i].1, cumulative_finish[new_interval_indexes[n+1]-1] ) ,
new_interval_indexes, arrayEnumerate(new_interval_indexes)
) as result
它会返回
[(1, 12), (15, 22)]
【讨论】:
另一种测试解决方案:
select
id,
data.1 as period_group,
min(data.2) as start,
max(data.3) as end,
end - start as diff
from
( select
id,
groupArray((start, end)) as arr,
arraySort(
y -> y.1,
arrayFilter(
x ->
not(arrayExists(
a ->
a.1 <= x.1
and a.2 > x.2
or a.1 < x.1
and a.2 >= x.2,
arr)),
arr)) as arr_clear,
arrayMap(
x, y -> (x, y),
arr_clear,
arrayPushFront(
arrayPopBack(arr_clear),
arr_clear[1])) as arr_w_prev,
arrayMap(
x, y -> (y, x.1, x.2),
arr_clear,
arrayCumSum(
x -> x.1.1 > x.2.2,
arr_w_prev)) as arr_groups
from
values (
'id String, start UInt8, end UInt8',
('a', 1, 10),
('a', 2, 8),
('a', 7, 15),
('a', 20, 25),
('b', 116, 130),
('b', 111, 114),
('b', 107, 109),
('b', 103, 105),
('b', 100, 120),
('c', 50, 60),
('c', 50, 70),
('d', 1, 5),
('d', 2, 3),
('d', 3, 8),
('d', 6, 9),
('d', 11, 15),
('e', 1, 5),
('e', 3, 7),
('e', 5, 10),
('e', 6, 7),
('e', 7, 8),
('e', 9, 12),
('e', 15, 20),
('e', 16, 22),
('f', 1, 8),
('f', 2, 9),
('f', 3, 5),
('f', 10, 12),
('f', 11, 13),
('f', 14, 15),
('f', 16, 21),
('f', 18, 18),
('f', 20, 21)
)
group by
id) as t
array join arr_groups as data
group by
id,
period_group
order by
id,
period_group;
【讨论】: