Thursday, January 5, 2012

clean text data with regex in SQL Server

CREATE FUNCTION dbo.tokenizer
(
    @string varchar(8000)
)
returns varchar(8000) AS
BEGIN
    DECLARE @handle int, @result varchar(8000)


    exec sp_oacreate 'vbscript.regexp', @handle output
    exec sp_oasetproperty @handle, 'pattern', '([{}\(\)\^$&._%#!@=<>:;,~`\’\*\?\/\+\|\[\\\\]|\]|\-)'
    exec sp_oasetproperty @handle, 'global', 'true'
    exec sp_oasetproperty @handle, 'ignorecase', 0
    exec sp_oamethod @handle, 'replace', @result output, @string, ' '
    exec sp_oadestroy @handle

    RETURN rtrim(ltrim(lower(replace(replace(replace(@result,' ','<>'),'><',''),'<>',' '))))
END

No comments:

Post a Comment