I am new to power query and I've never come across XML files before. I have an xml data source which when opened in excel had lots of XML tags in it that need cleansing. I found a VBA function which I adapted to give me my desired cleansed data. Its a bit messy (as is the data) but it works. What is the most efficient way to do this in power query? I tried List.RemoveItems and the standard find and replace. I got stuck at trying to find equivalent of "<[^>]+>". Thank you in advance. The following is the VBA that I adapted
>>> use code - tags <<<
>>> use code - tags <<<
Code:
Function StripHTML(cell As Range) As String Dim RegEx As Object Set RegEx = CreateObject("vbscript.regexp") Dim sInput As String Dim sOut As String sInput = cell.Text
sInput = Replace(sInput, "\x0D\x0A", Chr(10))
sInput = Replace(sInput, "\x00", Chr(10))
'replace HTML breaks and end of paragraphs with line breaks
sInput = Replace(sInput, "</p>" & Chr(10) & "<p> </p>", Chr(10) & Chr(10))
sInput = Replace(sInput, "<BR>", Chr(10))
sInput = Replace(sInput, "<br />&chr(10)&<br />", Chr(10))
sInput = Replace(sInput, "<br /><br />", Chr(10))
sInput = Replace(sInput, "<br /> <br />", Chr(10))
sInput = Replace(sInput, "<br /> <br />", Chr(10))
sInput = Replace(sInput, "<br />" & Chr(10) & " <br />", Chr(10))
sInput = Replace(sInput, "<br /> <br /> ", Chr(10))
sInput = Replace(sInput, Chr(10) & " " & Chr(10), Chr(10))
sInput = Replace(sInput, "<br />", Chr(10))
sInput = Replace(sInput, "<ol>" & Chr(10) & "<li>", "")
sInput = Replace(sInput, "</ol>" & Chr(10) & "</li>" & Chr(10) & "<li>", "")
sInput = Replace(sInput, "</ol>" & Chr(10) & "</li>" & Chr(10) & "</ol>" & Chr(10) & "</li>" & Chr(10) & "<li>", "")
sInput = Replace(sInput, "<ul>" & Chr(10) & "<li>", "")
sInput = Replace(sInput, Chr(10) & Chr(10) & Chr(10) & Chr(10) & Chr(10) & Chr(10), Chr(10))
sInput = Replace(sInput, Chr(10) & Chr(10) & Chr(10) & Chr(10) & Chr(10), Chr(10))
sInput = Replace(sInput, Chr(10) & Chr(10) & Chr(10) & Chr(10), Chr(10))
sInput = Replace(sInput, Chr(10) & Chr(10) & Chr(10), Chr(10))
sInput = Replace(sInput, Chr(10) & Chr(10), Chr(10))
sInput = Replace(sInput, Chr(10) & " ", Chr(10))
'add back all of the special characters
sInput = Replace(sInput, "–", "–")
sInput = Replace(sInput, "—", "—")
sInput = Replace(sInput, "¡", "¡")
sInput = Replace(sInput, "¿", "¿")
sInput = Replace(sInput, """, "")
sInput = Replace(sInput, "“", "")
sInput = Replace(sInput, "”", "")
sInput = Replace(sInput, "", "'")
sInput = Replace(sInput, "‘", "'")
sInput = Replace(sInput, "’", "’")
sInput = Replace(sInput, "«", "«")
sInput = Replace(sInput, "»", "»")
sInput = Replace(sInput, " ", " ")
sInput = Replace(sInput, "&", "&")
sInput = Replace(sInput, "¢", "¢")
sInput = Replace(sInput, "©", "©")
sInput = Replace(sInput, "÷", "÷")
sInput = Replace(sInput, ">", ">")
sInput = Replace(sInput, "<", "<")
sInput = Replace(sInput, "µ", "µ")
sInput = Replace(sInput, "·", "·")
sInput = Replace(sInput, "¶", "¶")
sInput = Replace(sInput, "±", "±")
sInput = Replace(sInput, "€", "€")
sInput = Replace(sInput, "£", "£")
sInput = Replace(sInput, "®", "®")
sInput = Replace(sInput, "§", "§")
sInput = Replace(sInput, "™", "™")
sInput = Replace(sInput, "¥", "¥")
sInput = Replace(sInput, "á", "á")
sInput = Replace(sInput, "Á", "Á")
sInput = Replace(sInput, "à", "à")
sInput = Replace(sInput, "À", "À")
sInput = Replace(sInput, "â", "â")
sInput = Replace(sInput, "Â", "Â")
sInput = Replace(sInput, "å", "å")
sInput = Replace(sInput, "Å", "Å")
sInput = Replace(sInput, "ã", "ã")
sInput = Replace(sInput, "Ã", "Ã")
sInput = Replace(sInput, "ä", "ä")
sInput = Replace(sInput, "Ä", "Ä")
sInput = Replace(sInput, "æ", "æ")
sInput = Replace(sInput, "Æ", "Æ")
sInput = Replace(sInput, "ç", "ç")
sInput = Replace(sInput, "Ç", "Ç")
sInput = Replace(sInput, "é", "é")
sInput = Replace(sInput, "É", "É")
sInput = Replace(sInput, "è", "è")
sInput = Replace(sInput, "È", "È")
sInput = Replace(sInput, "ê", "ê")
sInput = Replace(sInput, "Ê", "Ê")
sInput = Replace(sInput, "ë", "ë")
sInput = Replace(sInput, "Ë", "Ë")
sInput = Replace(sInput, "í", "í")
sInput = Replace(sInput, "Í", "Í")
sInput = Replace(sInput, "ì", "ì")
sInput = Replace(sInput, "Ì", "Ì")
sInput = Replace(sInput, "î", "î")
sInput = Replace(sInput, "Î", "Î")
sInput = Replace(sInput, "ï", "ï")
sInput = Replace(sInput, "Ï", "Ï")
sInput = Replace(sInput, "ñ", "ñ")
sInput = Replace(sInput, "Ñ", "Ñ")
sInput = Replace(sInput, "ó", "ó")
sInput = Replace(sInput, "Ó", "Ó")
sInput = Replace(sInput, "ò", "ò")
sInput = Replace(sInput, "Ò", "Ò")
sInput = Replace(sInput, "ô", "ô")
sInput = Replace(sInput, "Ô", "Ô")
sInput = Replace(sInput, "ø", "ø")
sInput = Replace(sInput, "Ø", "Ø")
sInput = Replace(sInput, "õ", "õ")
sInput = Replace(sInput, "Õ", "Õ")
sInput = Replace(sInput, "ö", "ö")
sInput = Replace(sInput, "Ö", "Ö")
sInput = Replace(sInput, "ß", "ß")
sInput = Replace(sInput, "ú", "ú")
sInput = Replace(sInput, "Ú", "Ú")
sInput = Replace(sInput, "ù", "ù")
sInput = Replace(sInput, "Ù", "Ù")
sInput = Replace(sInput, "û", "û")
sInput = Replace(sInput, "Û", "Û")
sInput = Replace(sInput, "ü", "ü")
sInput = Replace(sInput, "Ü", "Ü")
sInput = Replace(sInput, "ÿ", "ÿ")
sInput = Replace(sInput, "", "´")
sInput = Replace(sInput, "", "`")
sInput = Replace(sInput, "'", "'")
sInput = Replace(sInput, "º", "º")
sInput = Replace(sInput, "°", "º")
sInput = Replace(sInput, "•", Chr(10) & "-")
sInput = Replace(sInput, "²", "²")
sInput = Replace(sInput, "' && task.HTMLContent != '", "")
sInput = Replace(sInput, "' && task.HTMLNotes != '", "")
sInput = Replace(sInput, "' -->", "")
sInput = Replace(sInput, "≤", "<=")
sInput = Replace(sInput, "½", "½")
sInput = Replace(sInput, "¼", "¼")
sInput = Replace(sInput, "¾", "¾")
sInput = Replace(sInput, "μ", ChrW(181))
sInput = Replace(sInput, "Ω", ChrW(937))
'replace all the remaining HTML Tags
With RegEx
.Global = True
.IgnoreCase = True
.MultiLine = True
.Pattern = "<[^>]+>" 'Regular Expression for HTML Tags.
End With
sInput = Replace(sInput, Chr(10) & Chr(10), Chr(10))
sOut = RegEx.Replace(sInput, "")
StripHTML = sOut
Set RegEx = Nothing
End Function
Last edited by a moderator: