This script tries to transliterate Unicode characters to ASCII.
Why:
When I saw the topic about removing letter accents, I remembered that I had done something similar: I needed to transcribe names (Swedish, Russian and others) to ASCII.
Credits:
After searching the web I found a Perl script named Unidecode. Mr. Burke has done the whole transcription of Unicode symbols to ASCII.
Not all of course but a lot.
What i did:
I took his translated files and merged them into one big text file of ~400 kB.
You can load this file into an array and then easily translate a lot of Unicode characters to ASCII characters.
First you have to download his files and extract them to a directory. You can delete everything except the unidecode directory.
Put the following script in the folder that contains this directory and run it to create the text file unidecode.tbl.
Code: Select all
; Auto-execute section: build the table file with the default arguments
; (directory "unidecode", output file "unidecode.tbl"), then stop so that
; execution does not run on into the function definitions below.
makeUniDecodetablefile()
return
makeUniDecodetablefile(pathToUnidecodeDir="unidecode",tablename="unidecode.tbl"){
    ; Merge every *.pm file found in pathToUnidecodeDir into one table file.
    ;
    ; pathToUnidecodeDir - directory that holds the Perl block files (*.pm).
    ; tablename          - file to (re)create; it receives one line per block,
    ;                      ordered by block number, each line being the block's
    ;                      entries followed by " , " and a linefeed.
    ;
    ; The output file is replaced as a whole, so running the script again does
    ; not append a second copy of the table to an existing file.

    ; PerlfilePMToVar() returns the block number as written in the Perl file
    ; (a hex literal); make sure arithmetic on it yields decimal integers.
    SetFormat, IntegerFast, d

    blocks := []
    Loop, Files, %pathToUnidecodeDir%\*.pm
    {
        FileRead, content, %A_LoopFileFullPath%
        if ErrorLevel
            continue    ; unreadable file: leave it out instead of storing garbage

        ; content is passed ByRef and comes back reduced to the entry list.
        blockIndex := PerlfilePMToVar(content)
        if (blockIndex = "")
            continue    ; no block header recognised in this file

        ; +1 so that block 0 gets the array key 1; the for-loop below then
        ; enumerates the blocks in ascending numeric order.
        blocks[blockIndex + 1] := content
    }

    table := ""
    for blockNumber, element in blocks
        table .= element . " , `n"

    ; Nothing was read: keep whatever table file may already exist.
    if (table = "")
        return

    FileDelete, %tablename%
    FileAppend, %table%, %tablename%
}
PerlfilePMToVar(ByRef haystack){
    ; Reduce the text of one Perl block file to its comma-separated entry list.
    ;
    ; haystack - ByRef. On entry: the file's text. On return: the text captured
    ;            between "[... ] = [" and ",]", or 256 empty quoted entries when
    ;            the file mentions make_placeholder_map.
    ; Returns the text found inside the first pair of square brackets (the
    ; block number as written in the file), or an empty string if the
    ; expected pattern was not found.
    cleaned := haystack

    ; Pass 1: comments that follow a comma. Every comment found in the
    ; untouched input is removed (all occurrences) from the working copy.
    trailingComment := "i),\s+(#.*?`n)"
    at := 1
    while (at)
    {
        at := RegExMatch(haystack, trailingComment, hit, at + StrLen(hit))
        cleaned := StrReplace(cleaned, hit1, "")
    }

    ; Pass 2: "# BLOCK ..." comment lines, handled the same way.
    blockComment := "i)(#\s+BLOCK.*?`n)"
    at := 1
    while (at)
    {
        at := RegExMatch(haystack, blockComment, blockHit, at + StrLen(blockHit))
        cleaned := StrReplace(cleaned, blockHit1, "")
    }

    ; Join everything onto one line so that the patterns below can span it.
    cleaned := StrReplace(cleaned, "`n")

    entries := ""
    if InStr(cleaned, "make_placeholder_map")
    {
        ; Placeholder block: only the block number is taken from the file;
        ; the entry list becomes 256 empty quoted strings.
        RegExMatch(cleaned, "i)\[(.*)\]\s+=\s+Text", part)
        Loop, 255
            entries .= """"","
        entries .= """"""
    }
    else
    {
        ; Regular block: part1 is the block number, part2 the entry list.
        RegExMatch(cleaned, "i)\[(.*)\]\s+=\s+\[(.*),\]", part)
        entries := part2
    }

    haystack := StrReplace(entries, "`n")
    return part1
}
count(Text){
    ; Debugging helper: returns how many comma-separated entries Text holds.
    ; The entries qq{,,}, qq{,} and qq{, } contain commas themselves, so they
    ; are swapped for comma-free placeholders before the text is split.
    ; Every field counts as one entry, whatever its content; empty Text
    ; yields 0.
    shielded := StrReplace(Text, "qq{,,}", "qq{zweikomma}")
    shielded := StrReplace(shielded, "qq{,}", "qq{einkomma}")
    shielded := StrReplace(shielded, "qq{, }", "qq{dreikomma}")

    fields := 0
    Loop, Parse, shielded, `,
        fields += 1
    return fields
}
(put the file unidecode.tbl in the same dir)
Try it in the Chinese forum. I don't know whether the resulting ASCII characters carry any meaning.
Code: Select all
; Manual test: uncomment the next two lines to run unidecode() on a fixed
; sample string instead of the clipboard.
;text= €€€€€@@@ßäÄüÜö
;msgbox, % unidecode(text)
; Alt+Ctrl+U: show the clipboard text after it went through unidecode();
; "äÄ" is passed as donotdecode.
!^u::msgbox, % unidecode(clipboard,"äÄ")
return
unidecode(text, donotdecode=""){
    ; Transliterate text to ASCII with the table stored in unidecode.tbl.
    ;
    ; text        - string to convert.
    ; donotdecode - characters that must be kept as they are, e.g. "äÄ".
    ; Returns text with every non-ASCII character (code above 127) replaced by
    ; its table entry. ASCII characters and the characters listed in
    ; donotdecode are copied unchanged.
    ;
    ; The table is read from unidecode.tbl on first use and cached in a static
    ; array. If it cannot be loaded completely, an error box is shown and the
    ; script exits.
    ;
    ; Characters are looked up directly by code instead of going through
    ; "Transform, HTML": that command always rewrites the characters " & < >
    ; and linefeeds as HTML, which leaked &amp;, &lt;, <br> ... into the result.
    static a

    if !(a.length()=65536){
        a:=[]
        FileRead, tbl, unidecode.tbl
        a:=unidecodeTable2Array(tbl,a)
    }
    if !(a.length()=65536){
        msgbox, % "Error loading unidecode.tbl. Array length is " a.length() " instead of 65536."
        exitapp
    }

    converted := ""
    Loop, Parse, text
    {
        code := Ord(A_LoopField)
        ; Case-sensitive InStr: "ä" in donotdecode must not protect "Ä" too.
        if (code < 128 || InStr(donotdecode, A_LoopField, true))
            converted .= A_LoopField
        else
            converted .= a[code]
    }
    return converted
}
unidecodeTable2Array(Text,array){
    ; Fill array from the table text and return it.
    ;
    ; Text  - table text; each line holds the entries of one block.
    ; array - object that receives the entries. Line n (counted from 1) starts
    ;         at key (n-1)*256 and every comma-separated field of that line
    ;         takes the next key.
    ;
    ; Entry forms: "" -> empty string, qq{...} -> text between the braces,
    ; anything else -> the field without its first and last character.
    Loop, Parse, Text, `n
    {
        ; qq{,,}, qq{,} and qq{, } contain the field separator; hide them
        ; behind placeholders while the line is split at commas.
        row := StrReplace(A_LoopField, "qq{,,}", "qq{zweikomma}")
        row := StrReplace(row, "qq{,}", "qq{einkomma}")
        row := StrReplace(row, "qq{, }", "qq{dreikomma}")

        slot := (A_Index - 1) * 256
        Loop, Parse, row, `,
        {
            entry := Trim(A_LoopField)
            if (entry = "qq{zweikomma}")
                value := ",,"
            else if (entry = "qq{einkomma}")
                value := ","
            else if (entry = "qq{dreikomma}")
                value := ", "
            else if (entry = """""")
                value := ""
            else if InStr(entry, "qq{")
                value := SubStr(entry, 4, StrLen(entry) - 4)
            else
                value := SubStr(entry, 2, StrLen(entry) - 2)

            array[slot] := value
            slot += 1
        }
    }
    return array
}
getDecUnicode(haystack){
    ; Collect the decimal numbers of all "&#NNN;" references in haystack.
    ; Returns them in order of appearance as one comma-separated string
    ; (duplicates included), or an empty string when there are none.
    ; A reference without digits ("&#;") is matched but contributes nothing.
    codes := ""
    reference := "i)&#(\d+)?;"
    at := 1
    while (at)
    {
        ; Resume right behind the previous match.
        at := RegExMatch(haystack, reference, hit, at + StrLen(hit))
        if (hit1 = "")
            continue
        codes .= (codes = "" ? "" : ",") . hit1
    }
    return codes
}
⠁⠥⠞⠕⠓⠕⠞⠅⠑⠽ makes it possible!