Tuesday, June 26, 2012

VBA, URL to Root Domain

Public Function ReverseString(text As String)
    ReverseString = StrReverse(text)
End Function

Public Function getRootDomain(text As String)
    Dim i
   
    ' Remove protocol
    text = Replace(text, "http://", "")
    text = Replace(text, "https://", "")
    text = Replace(text, "ftp://", "")
   
    ' Remove trailing / and path
    i = InStr(text, "/")
    If (i > 0) Then
        text = Left(text, InStr(text, "/") - 1)
    End If
   
    If Left(Right(text, 5), 1) = "." Then     '  info TLD
   
        If InStr(6, ReverseString(text), ".") > 0 Then
            text = Right(text, InStr(6, ReverseString(text), ".") - 1)
        Else
            text = Replace(text, "www.", "")
        End If
   
    ElseIf Left(Right(text, 4), 1) = "." Then   '.xxx TLD
   
        If InStr(5, ReverseString(text), ".") > 0 Then
            text = Right(text, InStr(5, ReverseString(text), ".") - 1)
        Else
            text = Replace(text, "www.", "")
        End If
   
    Else
        'xxx.TLD.uk
        If Left(Right(text, 7), 1) = "." Then
            If InStr(8, ReverseString(text), ".") > 0 Then
                text = Right(text, InStr(8, ReverseString(text), ".") - 1)
            Else   'we don't not handle http://www.wp.pl/ case well
                text = Replace(text, "www.", "")
            End If
   
        'xxx.xx.uk
        ElseIf Left(Right(text, 6), 1) = "." Then
            If InStr(7, ReverseString(text), ".") > 0 Then
                text = Right(text, InStr(7, ReverseString(text), ".") - 1)
            Else   'we don't not handle http://www.wp.pl/ case well
                text = Replace(text, "www.", "")
            End If
   
        Else  'xxxx.uk
       
            If InStr(4, ReverseString(text), ".") > 0 Then
                text = Right(text, InStr(4, ReverseString(text), ".") - 1)
            Else
                text = Replace(text, "www.", "")
            End If
           
   
        End If
      
   
    End If
   
   
    getRootDomain = text
End Function

Friday, January 13, 2012

Excel URL to HOST

Excel URL to HOST. If the Host is of "www", it will be reduced to root domain.

=IF(LEFT(LEFT(SUBSTITUTE(SUBSTITUTE(A1,"https:","http:"),"http://",""),FIND("/",SUBSTITUTE(SUBSTITUTE(A1,"https:","http:"),"http://","")&"/")-1),4)="www.",MID(LEFT(SUBSTITUTE(SUBSTITUTE(A1,"https:","http:"),"http://",""),FIND("/",SUBSTITUTE(SUBSTITUTE(A1,"https:","http:"),"http://","")&"/")-1),5,256),LEFT(SUBSTITUTE(SUBSTITUTE(A1,"https:","http:"),"http://",""),FIND("/",SUBSTITUTE(SUBSTITUTE(A1,"https:","http:"),"http://","")&"/")-1))

Wednesday, January 11, 2012

clean text data with regex in MySQL

CREATE FUNCTION tokenizer(x VARCHAR(255))
RETURNS VARCHAR(255) DETERMINISTIC
RETURN replace(replace(replace(lower(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(x,':',' '),'~',' '),'!',' '),'@',' '),'#',' '),'$',' '),'%',' '),'*',' '),'(',' '),')',' '),'_',' '),'+',' '),'{',' '),'}',' '),'[',' '),']',' '),';',' '),' '' ',' '),':',' '),'"',' '),'<',' '),'>',' '),'?',' ')),' ','<>'),'><',''),'<>',' ');

Thursday, January 5, 2012

clean text data with regex in SQL Server

CREATE FUNCTION dbo.tokenizer
(
    @string varchar(8000)
)
returns varchar(8000) AS
BEGIN
    DECLARE @handle int, @result varchar(8000)


    exec sp_oacreate 'vbscript.regexp', @handle output
    exec sp_oasetproperty @handle, 'pattern', '([{}\(\)\^$&._%#!@=<>:;,~`\’\*\?\/\+\|\[\\\\]|\]|\-)'
    exec sp_oasetproperty @handle, 'global', 'true'
    exec sp_oasetproperty @handle, 'ignorecase', 0
    exec sp_oamethod @handle, 'replace', @result output, @string, ' '
    exec sp_oadestroy @handle

    RETURN rtrim(ltrim(lower(replace(replace(replace(@result,' ','<>'),'><',''),'<>',' '))))
END