HTML Decoder

Post your working scripts, libraries and tools
Freire
Posts: 15
Joined: 10 Dec 2017, 11:30

HTML Decoder

04 Nov 2018, 11:53

If you want to encode string to HTML: https://autohotkey.com/docs/commands/Transform.htm#HTML

HTML Decoder

This function translates HTML encoded string back to the original string.
It does have all listed HTML4 named Entities. IMO it still does a very good cover in general (Talking with HTML5 in mind).
Most of the new conventions aren't used yet. Entities References: https://en.wikipedia.org/wiki/List_of_X ... es_in_HTML

It supports Decimal encode, Hexadecimal encode and Named encode: Eg: "&" "&" "&" Are all equal to: "&"

Thanks to @CerpinTaxt[Discord] for some help with visual arrangement.

Code: Select all

htmlDecode(str) {
  html_array_1 := {"exclamation":33,"quot":34,"percent":37,"amp":38,"apos":39,"add":43,"lt":60,"equal":61,"gt":62,"nbsp":160,"iexcl":161,"cent":162,"pound":163,"curren":164,"yen":165,"brvbar":166,"sect":167,"uml":168,"copy":169,"ordf":170,"laquo":171,"not":172,"shy":173,"reg":174,"macr":175,"deg":176,"plusmn":177,"sup2":178,"sup3":179,"acute":180,"micro":181,"para":182,"middot":183,"cedil":184,"sup1":185,"ordm":186,"raquo":187,"frac14":188,"frac12":189,"frac34":190,"iquest":191,"Agrave":192,"Aacute":193,"Acirc":194,"Atilde":195,"Auml":196,"Aring":197,"AElig":198,"Ccedil":199,"Egrave":200,"Eacute":201,"Ecirc":202,"Euml":203,"Igrave":204,"Iacute":205,"Icirc":206,"Iuml":207,"ETH":208,"Ntilde":209,"Ograve":210,"Oacute":211,"Ocirc":212,"Otilde":213,"Ouml":214,"times":215,"Oslash":216,"Ugrave":217,"Uacute":218,"Ucirc":219}
  html_array_2 := {"Uuml":220,"Yacute":221,"THORN":222,"szlig":223,"agrave":224,"aacute":225,"acirc":226,"atilde":227,"auml":228,"aring":229,"aelig":230,"ccedil":231,"egrave":232,"eacute":233,"ecirc":234,"euml":235,"igrave":236,"iacute":237,"icirc":238,"iuml":239,"eth":240,"ntilde":241,"ograve":242,"oacute":243,"ocirc":244,"otilde":245,"ouml":246,"divide":247,"oslash":248,"ugrave":249,"uacute":250,"ucirc":251,"uuml":252,"yacute":253,"thorn":254,"yuml":255,"OElig":338,"oelig":339,"Scaron":352,"scaron":353,"Yuml":376,"fnof":402,"circ":710,"tilde":732,"Alpha":913,"Beta":914,"Gamma":915,"Delta":916,"Epsilon":917,"Zeta":918,"Eta":919,"Theta":920,"Iota":921,"Kappa":922,"Lambda":923,"Mu":924,"Nu":925,"Xi":926,"Omicron":927,"Pi":928,"Rho":929,"Sigma":931}
  html_array_3 := {"Tau":932,"Upsilon":933,"Phi":934,"Chi":935,"Psi":936,"Omega":937,"alpha":945,"beta":946,"gamma":947,"delta":948,"epsilon":949,"zeta":950,"eta":951,"theta":952,"iota":953,"kappa":954,"lambda":955,"mu":956,"nu":957,"xi":958,"omicron":959,"pi":960,"rho":961,"sigmaf":962,"sigma":963,"tau":964,"upsilon":965,"phi":966,"chi":967,"psi":968,"omega":969,"thetasym":977,"upsih":978,"piv":982,"ensp":8194,"emsp":8195,"thinsp":8201,"zwnj":8204,"zwj":8205,"lrm":8206,"rlm":8207,"ndash":8211,"mdash":8212,"horbar":8213,"lsquo":8216,"rsquo":8217,"sbquo":8218,"ldquo":8220,"rdquo":8221,"bdquo":8222,"dagger":8224,"Dagger":8225,"bull":8226,"hellip":8230,"permil":8240,"prime":8242,"Prime":8243,"lsaquo":8249,"rsaquo":8250,"oline":8254,"frasl":8260,"euro":8364,"image":8465,"weierp":8472,"real":8476,"trade":8482,"alefsym":8501,"larr":8592,"uarr":8593,"rarr":8594,"darr":8595,"harr":8596,"crarr":8629,"lArr":8656,"uArr":8657,"rArr":8658,"dArr":8659,"hArr":8660,"forall":8704,"part":8706,"exist":8707,"empty":8709,"nabla":8711,"isin":8712,"notin":8713,"ni":8715,"prod":8719,"sum":8721,"minus":8722,"lowast":8727,"radic":8730,"prop":8733,"infin":8734,"ang":8736,"and":8743,"or":8744,"cap":8745,"cup":8746,"int":8747,"there4":8756,"sim":8764,"cong":8773,"asymp":8776,"ne":8800,"equiv":8801,"le":8804,"ge":8805,"sub":8834,"sup":8835,"nsub":8836,"sube":8838,"supe":8839,"oplus":8853,"otimes":8855,"perp":8869,"sdot":8901,"lceil":8968,"rceil":8969,"lfloor":8970,"rfloor":8971,"lang":9001,"rang":9002,"loz":9674,"spades":9824,"clubs":9827,"hearts":9829,"diams":9830}

  While RegExMatch(str, "(?<=&#)\d+(?=;)", char)
    StringReplace, str, str, % "&#" char ";" , % Chr(char), All
  While RegExMatch(str, "i)(?<=&#x)[a-f0-9]+(?=;)", char)
    StringReplace, str, str, % "&#x" char ";" , % Chr("0x"char), All
  While RegExMatch(str, "(?<=&)\w+(?=;)", char) {
    if (html_array_1[char]){
      StringReplace, str, str, % "&" char ";" , % Chr(html_array_1[char]), All
    } Else If (html_array_2[char]){
      StringReplace, str, str, % "&" char ";" , % Chr(html_array_2[char]), All
    } Else If (html_array_3[char]){
      StringReplace, str, str, % "&" char ";" , % Chr(html_array_3[char]), All
    } Else {
      MsgBox, % "ERROR:`nFound an Unkown Named Entity: &" char ";`nIt will be ignored."
      StringReplace, str, str, % "&" char ";" , % "", All
    }
  }
  Return str
}
Validation code example:

Code: Select all

encoded_str=
(`%
<div class="header-wrap--home  js-header-wrap">
<div class="header--aside js-header-aside"><a class="header__button--menu  js-side-menu-open" href="#">%&#168;$#&%&@*#(!)(@#(<>:ASD?</a><div class="header--aside__item showcase header__label"><span class="header__clickable js-hl-button" data-type="showcase">
)
decoded_str=
(`%
<div class="header-wrap--home  js-header-wrap">
<div class="header--aside js-header-aside"><a class="header__button--menu  js-side-menu-open" href="#">%¨$#&%&@*#(!)(@#(<>:ASD?</a><div class="header--aside__item showcase header__label"><span class="header__clickable js-hl-button" data-type="showcase">
)

msgbox, % htmlDecode(encoded_str)
msgbox, % htmlDecode(encoded_str)=decoded_str?"Working":"Not Working"

Return

htmlDecode(str) {
  html_array_1 := {"exclamation":33,"quot":34,"percent":37,"amp":38,"apos":39,"add":43,"lt":60,"equal":61,"gt":62,"nbsp":160,"iexcl":161,"cent":162,"pound":163,"curren":164,"yen":165,"brvbar":166,"sect":167,"uml":168,"copy":169,"ordf":170,"laquo":171,"not":172,"shy":173,"reg":174,"macr":175,"deg":176,"plusmn":177,"sup2":178,"sup3":179,"acute":180,"micro":181,"para":182,"middot":183,"cedil":184,"sup1":185,"ordm":186,"raquo":187,"frac14":188,"frac12":189,"frac34":190,"iquest":191,"Agrave":192,"Aacute":193,"Acirc":194,"Atilde":195,"Auml":196,"Aring":197,"AElig":198,"Ccedil":199,"Egrave":200,"Eacute":201,"Ecirc":202,"Euml":203,"Igrave":204,"Iacute":205,"Icirc":206,"Iuml":207,"ETH":208,"Ntilde":209,"Ograve":210,"Oacute":211,"Ocirc":212,"Otilde":213,"Ouml":214,"times":215,"Oslash":216,"Ugrave":217,"Uacute":218,"Ucirc":219}
  html_array_2 := {"Uuml":220,"Yacute":221,"THORN":222,"szlig":223,"agrave":224,"aacute":225,"acirc":226,"atilde":227,"auml":228,"aring":229,"aelig":230,"ccedil":231,"egrave":232,"eacute":233,"ecirc":234,"euml":235,"igrave":236,"iacute":237,"icirc":238,"iuml":239,"eth":240,"ntilde":241,"ograve":242,"oacute":243,"ocirc":244,"otilde":245,"ouml":246,"divide":247,"oslash":248,"ugrave":249,"uacute":250,"ucirc":251,"uuml":252,"yacute":253,"thorn":254,"yuml":255,"OElig":338,"oelig":339,"Scaron":352,"scaron":353,"Yuml":376,"fnof":402,"circ":710,"tilde":732,"Alpha":913,"Beta":914,"Gamma":915,"Delta":916,"Epsilon":917,"Zeta":918,"Eta":919,"Theta":920,"Iota":921,"Kappa":922,"Lambda":923,"Mu":924,"Nu":925,"Xi":926,"Omicron":927,"Pi":928,"Rho":929,"Sigma":931}
  html_array_3 := {"Tau":932,"Upsilon":933,"Phi":934,"Chi":935,"Psi":936,"Omega":937,"alpha":945,"beta":946,"gamma":947,"delta":948,"epsilon":949,"zeta":950,"eta":951,"theta":952,"iota":953,"kappa":954,"lambda":955,"mu":956,"nu":957,"xi":958,"omicron":959,"pi":960,"rho":961,"sigmaf":962,"sigma":963,"tau":964,"upsilon":965,"phi":966,"chi":967,"psi":968,"omega":969,"thetasym":977,"upsih":978,"piv":982,"ensp":8194,"emsp":8195,"thinsp":8201,"zwnj":8204,"zwj":8205,"lrm":8206,"rlm":8207,"ndash":8211,"mdash":8212,"horbar":8213,"lsquo":8216,"rsquo":8217,"sbquo":8218,"ldquo":8220,"rdquo":8221,"bdquo":8222,"dagger":8224,"Dagger":8225,"bull":8226,"hellip":8230,"permil":8240,"prime":8242,"Prime":8243,"lsaquo":8249,"rsaquo":8250,"oline":8254,"frasl":8260,"euro":8364,"image":8465,"weierp":8472,"real":8476,"trade":8482,"alefsym":8501,"larr":8592,"uarr":8593,"rarr":8594,"darr":8595,"harr":8596,"crarr":8629,"lArr":8656,"uArr":8657,"rArr":8658,"dArr":8659,"hArr":8660,"forall":8704,"part":8706,"exist":8707,"empty":8709,"nabla":8711,"isin":8712,"notin":8713,"ni":8715,"prod":8719,"sum":8721,"minus":8722,"lowast":8727,"radic":8730,"prop":8733,"infin":8734,"ang":8736,"and":8743,"or":8744,"cap":8745,"cup":8746,"int":8747,"there4":8756,"sim":8764,"cong":8773,"asymp":8776,"ne":8800,"equiv":8801,"le":8804,"ge":8805,"sub":8834,"sup":8835,"nsub":8836,"sube":8838,"supe":8839,"oplus":8853,"otimes":8855,"perp":8869,"sdot":8901,"lceil":8968,"rceil":8969,"lfloor":8970,"rfloor":8971,"lang":9001,"rang":9002,"loz":9674,"spades":9824,"clubs":9827,"hearts":9829,"diams":9830}

  While RegExMatch(str, "(?<=&#)\d+(?=;)", char)
    StringReplace, str, str, % "&#" char ";" , % Chr(char), All
  While RegExMatch(str, "i)(?<=&#x)[a-f0-9]+(?=;)", char)
    StringReplace, str, str, % "&#x" char ";" , % Chr("0x"char), All
  While RegExMatch(str, "(?<=&)\w+(?=;)", char) {
    if (html_array_1[char]){
      StringReplace, str, str, % "&" char ";" , % Chr(html_array_1[char]), All
    } Else If (html_array_2[char]){
      StringReplace, str, str, % "&" char ";" , % Chr(html_array_2[char]), All
    } Else If (html_array_3[char]){
      StringReplace, str, str, % "&" char ";" , % Chr(html_array_3[char]), All
    } Else {
      MsgBox, % "ERROR:`nFound an Unkown Named Entity: &" char ";`nIt will be ignored."
      StringReplace, str, str, % "&" char ";" , % "", All
    }
  }
  Return str
}
Last edited by Freire on 06 Nov 2018, 06:26, edited 4 times in total.
burque505
Posts: 684
Joined: 22 Jan 2017, 19:37

Re: HTML Decoder

05 Nov 2018, 09:11

Thank you Freire, it works well for me so far.
I changed

Code: Select all

msgbox, % htmlDecode(encoded_str)
msgbox, % htmlDecode(encoded_str)=str_confirm?"Working":"Not Working"
to

Code: Select all

msgbox, % htmlDecode(encoded_str)
msgbox, % htmlDecode(encoded_str)=decoded_str?"Working":"Not Working"
in the validation example to match the decoded string rather than one that wasn't there :)
Regards,
burque505
Freire
Posts: 15
Joined: 10 Dec 2017, 11:30

Re: HTML Decoder

05 Nov 2018, 16:06

Thanks, I've just edited. Now it's the correct variable.
User avatar
SKAN
Posts: 343
Joined: 29 Sep 2013, 16:58

Re: HTML Decoder

18 Nov 2018, 08:01

I've been planning to upgrade my old function to support HTML 4.
I wouldn't use associative array(s) to resolve named entities.. Associative array keys aren't case sensitive.
In your function (for eg.) both Aacute (193) and aacute (225) resolves to Aacute (193) only.

Same problem with StringReplace command ( deprecated. Use StrReplace() ).
It will replace all Aacute and aacute in one shot unless StringCaseSense is on/1.
RegExReplace() would be considerably slower, I guess.

Then there is the problem of uppercase entities (for eg. &GT;) which needs to resolved as case insensitive match.
User avatar
kczx3
Posts: 694
Joined: 06 Oct 2015, 21:39

Re: HTML Decoder

18 Nov 2018, 20:08

SKAN, I think you mean HTML 5
User avatar
SKAN
Posts: 343
Joined: 29 Sep 2013, 16:58

Re: HTML Decoder

18 Nov 2018, 21:09

kczx3 wrote:
18 Nov 2018, 20:08
SKAN, I think you mean HTML 5
Why? You need it? :)
I did mean HTML 4. Its simply a matter of replacing the HTML 4 lookup table with HTML 5.
It only takes 3 lines for me to resolve HTML entities but here follows the 26KB HTML 5 lookup table that won't even init in one Static variable.

Spoiler
User avatar
kczx3
Posts: 694
Joined: 06 Oct 2015, 21:39

Re: HTML Decoder

18 Nov 2018, 21:13

I guess my point was that no one should still be writing HTML 4.
User avatar
SKAN
Posts: 343
Joined: 29 Sep 2013, 16:58

Re: HTML Decoder

18 Nov 2018, 21:27

I guess my point was that no one should still be writing HTML 4.
Ah, I see... Point noted. I rarely write HTML... :)
Freire
Posts: 15
Joined: 10 Dec 2017, 11:30

Re: HTML Decoder

24 Nov 2018, 00:16

Thanks for the feedback. I'll be implementing the changes.
I haven't implemented the HTML5 because I haven't seen a need for that, and as SKAN mentioned: The table is too big.
Even with the much smaller HTML4 it already doesn't fit into one variable alone.
Skan, if you update your function, post link here I'll be glad to use it.
User avatar
SKAN
Posts: 343
Joined: 29 Sep 2013, 16:58

Re: HTML Decoder

24 Nov 2018, 07:43

Skan, if you update your function, post link here I'll be glad to use it.
Sure. It will take a while though. I am writing it using InStr() and SubStr(), but the problem is
StringReplace() has to be called between StringCaseSense, On and StringCaseSense, Off
I can't use StringCaseSense, On at the top of function as it overrides the case sensitivity parameter of InStr().

If you're interested, I can post here a (slower) RegEx powered version which you can adapt to your needs.

:)

Return to “Scripts and Functions”

Who is online

Users browsing this forum: No registered users and 23 guests