我见过很多使用带循环的标量 UDF 的解决方案,但我都不太喜欢,所以我也来凑个热闹,提供一种不同的方法。
借助数字表(numbers table),您可以把每个值拆分为单个字符,去掉非数字字符,然后使用 FOR XML 将各行重新拼接起来,例如:
-- Strip every non-digit character from t.dats with a set-based approach:
-- explode each string into one row per character position using a numbers
-- (tally) CTE, keep only the digit characters, then concatenate them back
-- together in position order with FOR XML PATH('').
WITH Numbers (Number) AS
(   -- Sequential integers 1..N, built by cross-joining 10-row constant sets.
    -- N must be at least the length of the longest string in t.dats.
    SELECT  ROW_NUMBER() OVER(ORDER BY N1.N)
    FROM    (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N1 (N)              -- 10
            CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N2 (N)   -- 100
            CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N3 (N)   -- 1,000
            --CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N4 (N) -- 10,000
            --CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N5 (N) -- 100,000
            -- Comment or uncomment rows as necessary depending on your max string length.
)
SELECT  t.dats,
        -- Extracted as VARCHAR(MAX) rather than INT: an INT conversion raises an
        -- error once the digit string exceeds 2,147,483,647 and silently drops
        -- leading zeros. CAST the result to a numeric type if the values are
        -- known to fit.
        Stripped = x.data.value('.', 'VARCHAR(MAX)')
FROM    @tabl AS t
        CROSS APPLY
        (   -- One row per digit character of t.dats, in original position order.
            -- The column is deliberately unnamed so FOR XML PATH('') emits bare text.
            SELECT  SUBSTRING(t.dats, n.Number, 1)
            FROM    Numbers n
            WHERE   n.Number <= LEN(t.dats)
            AND     SUBSTRING(t.dats, n.Number, 1) LIKE '[0-9]'
            ORDER BY n.Number
            FOR XML PATH(''), TYPE
        ) x (data);
结果如下:
dats Stripped
----------------------
103-P705hh 103705
115-xxx-44 11544
103-705.13 10370513
525-hheef4 5254
我没有做过任何测试,所以有可能把每个字符串拆成单个字符再重新拼接所增加的开销,实际上比带循环的 UDF 还要大得多。
我决定对此进行基准测试
1.设置函数
CREATE FUNCTION dbo.ExtractNumeric_TVF (@Input VARCHAR(8000))
RETURNS TABLE
AS
/*
    Inline table-valued function that removes every non-digit character
    from @Input.

    Parameter:
        @Input  - the string to clean (up to 8,000 characters).

    Result set:
        Stripped (VARCHAR(MAX)) - the characters of @Input that match [0-9],
        concatenated in their original order.

    Approach: generate one integer per character position, keep the positions
    whose character is a digit, and concatenate those characters with
    FOR XML PATH('').
*/
RETURN
(
    WITH Ten (N) AS
    (   -- Ten constant rows; cross-joined below to manufacture positions.
        SELECT  v.N
        FROM    (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS v (N)
    ),
    Tally (Pos) AS
    (   -- 10 x 10 x 10 x 10 = 10,000 candidate rows, enough for VARCHAR(8000);
        -- TOP limits the rows produced to the length of the input.
        SELECT  TOP (LEN(@Input))
                ROW_NUMBER() OVER (ORDER BY a.N)
        FROM    Ten AS a
                CROSS JOIN Ten AS b
                CROSS JOIN Ten AS c
                CROSS JOIN Ten AS d
    )
    SELECT  digits.data.value('.', 'VARCHAR(MAX)') AS Stripped
    FROM    (   -- The column is left unnamed on purpose so that FOR XML PATH('')
                -- emits the characters as bare text with no element wrapper.
                SELECT   SUBSTRING(@Input, tl.Pos, 1)
                FROM     Tally AS tl
                WHERE    SUBSTRING(@Input, tl.Pos, 1) LIKE '[0-9]'
                AND      tl.Pos <= LEN(@Input)
                ORDER BY tl.Pos
                FOR XML PATH(''), TYPE
            ) AS digits (data)
);
GO
CREATE FUNCTION dbo.ExtractNumeric_UDF (@s VARCHAR(8000))
RETURNS VARCHAR(8000)
AS
BEGIN
    -- Scalar, loop-based digit extractor used as the baseline in the benchmark.
    -- Repeatedly takes the first character of @s, appends it to the result if
    -- it is a digit, then removes it from @s until nothing is left.
    -- Returns the digits of @s in their original order ('' when there are none).
    DECLARE @Digits VARCHAR(MAX) = '';
    DECLARE @Head   CHAR(1);

    WHILE LEN(@s) > 0
    BEGIN
        SET @Head = LEFT(@s, 1);

        IF @Head LIKE '[0123456789]'
        BEGIN
            SET @Digits = @Digits + @Head;
        END;

        -- Drop the character just examined. LEN() ignores trailing spaces, so
        -- any trailing blanks are discarded here as well.
        SET @s = SUBSTRING(@s, 2, LEN(@s) - 1);
    END;

    RETURN @Digits;
END
GO
2.创建第一组样本数据和日志表
-- Sample data table: one string to be cleaned per row.
CREATE TABLE dbo.T
(
    Value VARCHAR(8000) NOT NULL
);

-- Load 1,000 rows, each a random-length prefix (at most 36 characters) of a
-- freshly generated GUID. The self cross join of sys.all_objects is only a
-- convenient source of enough rows for TOP to draw from.
INSERT INTO dbo.T (Value)
SELECT  TOP (1000)
        LEFT(NEWID(), CEILING(RAND(CHECKSUM(NEWID())) * 36))
FROM    sys.all_objects AS a
        CROSS JOIN sys.all_objects AS b;

-- Benchmark log: one row per timed run of a function.
CREATE TABLE dbo.TestLog
(
    Fx           VARCHAR(255),
    NumberOfRows INT,
    TimeStart    DATETIME2(7),
    TimeEnd      DATETIME2(7)
);
3.运行测试
GO
-- Benchmark batch for the scalar UDF: run the function over every row of
-- dbo.T, then log the run. "GO 100" repeats the whole batch 100 times.
--
-- The start time, end time and row count are captured in variables and written
-- as one complete log row. This records the number of rows actually processed
-- (instead of a hard-coded 1000) and never touches log rows left unfinished by
-- another or an aborted run.
DECLARE @T TABLE (Val VARCHAR(8000));   -- sink for the function output
DECLARE @TimeStart    DATETIME2(7),
        @TimeEnd      DATETIME2(7),
        @NumberOfRows INT;

SET @TimeStart = SYSDATETIME();

INSERT @T (Val)
SELECT  dbo.ExtractNumeric_UDF(Value)
FROM    dbo.T;

-- @@ROWCOUNT must be read immediately after the INSERT it describes.
SELECT  @NumberOfRows = @@ROWCOUNT,
        @TimeEnd      = SYSDATETIME();

INSERT dbo.TestLog (Fx, NumberOfRows, TimeStart, TimeEnd)
VALUES ('dbo.ExtractNumeric_UDF', @NumberOfRows, @TimeStart, @TimeEnd);
GO 100
-- Benchmark batch for the inline TVF: apply the function to every row of
-- dbo.T, then log the run. "GO 100" repeats the whole batch 100 times.
--
-- The start time, end time and row count are captured in variables and written
-- as one complete log row. This records the number of rows actually processed
-- (instead of a hard-coded 1000) and never touches log rows left unfinished by
-- another or an aborted run.
DECLARE @T TABLE (Val VARCHAR(8000));   -- sink for the function output
DECLARE @TimeStart    DATETIME2(7),
        @TimeEnd      DATETIME2(7),
        @NumberOfRows INT;

SET @TimeStart = SYSDATETIME();

INSERT @T (Val)
SELECT  f.Stripped
FROM    dbo.T
        CROSS APPLY dbo.ExtractNumeric_TVF(Value) f;

-- @@ROWCOUNT must be read immediately after the INSERT it describes.
SELECT  @NumberOfRows = @@ROWCOUNT,
        @TimeEnd      = SYSDATETIME();

INSERT dbo.TestLog (Fx, NumberOfRows, TimeStart, TimeEnd)
VALUES ('dbo.ExtractNumeric_TVF', @NumberOfRows, @TimeStart, @TimeEnd);
GO 100
4.获取结果
-- Average elapsed time per function and row count, in milliseconds.
-- DATEDIFF returns an INT, so AVG yields a whole number of milliseconds.
-- Column names use the casing declared on dbo.TestLog so the query also runs
-- under a case-sensitive collation; ORDER BY makes the output order stable.
SELECT   Fx,
         NumberOfRows,
         RunTime = AVG(DATEDIFF(MILLISECOND, TimeStart, TimeEnd))
FROM     dbo.TestLog
GROUP BY Fx, NumberOfRows
ORDER BY NumberOfRows, Fx;
我分别在 1,000 行和 10,000 行数据上运行了以上测试(数据仅由 NEWID() 生成,因此每个字符串最多只有 36 个字符),结果是:
Fx NumberOfRows RunTime
--------------------------------------------------------
dbo.ExtractNumeric_TVF 1000 31
dbo.ExtractNumeric_UDF 1000 56
dbo.ExtractNumeric_TVF 10000 280
dbo.ExtractNumeric_UDF 10000 510
因此,TVF 的耗时大约只有 UDF 的一半。
我还想测试边界情况,所以改为插入 1,000 行更长的字符串(每行 5,400 个字符):
-- Edge-case data set: replace the sample data with 1,000 long strings.
-- Five GUIDs concatenated give 180 characters; repeated 30 times = 5,400.
TRUNCATE TABLE dbo.T;

-- Clear the earlier timings too. The short-string runs were also logged with
-- NumberOfRows = 1000, so leaving them in place would average both data sets
-- into the same (Fx, NumberOfRows) group of the results query.
TRUNCATE TABLE dbo.TestLog;

INSERT INTO dbo.T (Value)
SELECT  TOP (1000)
        REPLICATE(CONCAT(NEWID(), NEWID(), NEWID(), NEWID(), NEWID()), 30)
FROM    sys.all_objects AS a
        CROSS JOIN sys.all_objects AS b;
这正是 TVF 真正体现优势的地方,运行速度快了 5 倍以上:
Fx NumberOfRows RunTime
------------------------------------------------
dbo.ExtractNumeric_TVF 1000 2485
dbo.ExtractNumeric_UDF 1000 12955