AutoHotkey Community

Posted: **10 Oct 2013, 20:33**

with the help of this C implementation:

Spoiler

http://stackoverflow.com/questions/1031645/how-to-detect-utf-8-in-plain-c
http://www.w3.org/International/questions/qa-forms-utf-8

/*
 * Report whether a NUL-terminated byte string is well-formed UTF-8 text.
 *
 * Returns 1 when every byte up to the terminator belongs to an accepted
 * sequence, 0 otherwise (including when `string` is a null pointer).
 * An empty string is accepted.
 *
 * Accepted single bytes are TAB, LF, CR and 0x20..0x7E only; other ASCII
 * control characters and 0x7F are rejected.  Multi-byte sequences must be
 * shortest-form, must not encode surrogates (ED A0..BF xx) and must not
 * exceed plane 16 (F4 90.. and above).
 */
_Bool is_utf8(const char * string)
{
    const unsigned char *cursor = (const unsigned char *)string;

    if (!string)
        return 0;

    for (;;) {
        const unsigned char lead = cursor[0];
        /* Allowed range of the byte that follows the lead byte. */
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;
        int trailing; /* number of continuation bytes expected */
        int k;

        if (lead == 0x00)
            return 1; /* reached the terminator without a violation */

        /* Plain text ASCII. */
        if (lead == 0x09 || lead == 0x0A || lead == 0x0D ||
            (lead >= 0x20 && lead <= 0x7E)) {
            cursor += 1;
            continue;
        }

        if (lead >= 0xC2 && lead <= 0xDF) {
            /* 2-byte form; C0/C1 would be overlong and are not accepted. */
            trailing = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            trailing = 2;
            if (lead == 0xE0)
                second_min = 0xA0; /* exclude overlong 3-byte forms */
            else if (lead == 0xED)
                second_max = 0x9F; /* exclude UTF-16 surrogates */
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            trailing = 3;
            if (lead == 0xF0)
                second_min = 0x90; /* exclude overlong 4-byte forms */
            else if (lead == 0xF4)
                second_max = 0x8F; /* stay within plane 16 */
        } else {
            return 0; /* stray continuation byte or unsupported lead */
        }

        /*
         * Bytes are examined strictly left to right and the scan stops at
         * the first mismatch, so a terminator inside a truncated sequence
         * is never stepped over.
         */
        if (cursor[1] < second_min || cursor[1] > second_max)
            return 0;
        for (k = 2; k <= trailing; k++) {
            if (cursor[k] < 0x80 || cursor[k] > 0xBF)
                return 0;
        }

        cursor += trailing + 1;
    }
}

here is a rough copy of the implementation:

Spoiler

Code: Select all

/*
http://stackoverflow.com/questions/1031645/how-to-detect-utf-8-in-plain-c
http://www.w3.org/International/questions/qa-forms-utf-8
	As an example, in Perl, a regular expression testing for UTF-8 may look as follows:

	$field =~
	  m/\A(
		 [\x09\x0A\x0D\x20-\x7E]            # ASCII
	   | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
	   |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
	   | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
	   |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
	   |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
	   | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
	   |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
	  )*\z/x;
*/

; Heuristically decide whether a file is binary by sampling its first bytes.
;
; Filename      - path of the file to inspect.
; NumBytes      - how many leading bytes to sample (default 32).
; Minimum       - samples shorter than this are assumed to be text
;                 (4 is the recommended minimum for Unicode detection).
; detectunicode - accepted for compatibility; this implementation does not
;                 consult it.
;
; Returns 1 when the sample looks binary, 0 when it looks like text
; (Unicode BOM present, valid UTF-8 per is_utf8_x, or plain ASCII text).
; A file that cannot be opened is reported as 0.
isBinFile(Filename,NumBytes=32,Minimum=4,detectunicode=1) {
	
	file:=FileOpen(Filename,"r")
	if (!IsObject(file)) ;open failed: nothing to sample, keep the "not binary" answer
		return 0
	file.Position:=0 ;force position to 0 (zero) so a BOM is not skipped
	nbytes:=file.RawRead(rawbytes,NumBytes) ;read bytes
	file.Close() ;close file
	
	if (nbytes < Minimum) ;recommended 4 minimum for unicode detection
		return 0 ;assume text file, if too short
	
	bytes:=[] ;zero-based byte array, c-style
	
	loop % nbytes ;create c-style bytes array
		bytes[(A_Index-1)]:=Numget(&rawbytes,(A_Index-1),"UChar")
	
	;determine BOM if possible/existent
	if (bytes[0]=0xFE && bytes[1]=0xFF)
		|| (bytes[0]=0xFF && bytes[1]=0xFE)
		return 0 ;text Utf-16 BE/LE file
	if (bytes[0]=0xEF && bytes[1]=0xBB && bytes[2]=0xBF)
		|| (is_utf8_x(nbytes,rawbytes))
		return 0 ;text Utf-8 file (BOM, or the whole sample validates as UTF-8)
	if (bytes[0]=0x00 && bytes[1]=0x00
		&& bytes[2]=0xFE && bytes[3]=0xFF)
		|| (bytes[0]=0xFF && bytes[1]=0xFE
		&& bytes[2]=0x00 && bytes[3]=0x00)
		return 0 ;text Utf-32 BE/LE file
	
	;classic check: any byte outside TAB..CR and 0x20..0x7E marks the file as binary
	loop, %nbytes% {
		if (bytes[(A_Index-1)]<9) or (bytes[(A_Index-1)]>126)
			or ((bytes[(A_Index-1)]<32) and (bytes[(A_Index-1)]>13))
			return 1
	}
	
	return 0
}

; Check whether the first nbytes bytes of a raw buffer form well-formed UTF-8
; text, following the W3C "qa-forms-utf-8" pattern.
;
; nbytes   - number of bytes to examine.
; rawbytes - variable holding the raw bytes. It is taken ByRef so the bytes
;            are read from the caller's buffer itself instead of from a
;            by-value string copy of binary data.
;
; Returns 1 when every byte belongs to an accepted sequence (an empty range
; is accepted), 0 otherwise. Accepted single bytes are TAB, LF, CR and
; 0x20..0x7E; multi-byte sequences must be shortest-form, must not encode
; surrogates and must not go beyond plane 16. A sequence cut off by the end
; of the range is rejected.
is_utf8_x(nbytes,ByRef rawbytes)
{
	i:=0
	
	while(i<nbytes) {
		lead:=NumGet(&rawbytes,i,"UChar")
		
		;// ASCII
		if( lead == 0x09 || lead == 0x0A || lead == 0x0D
			|| (0x20 <= lead && lead <= 0x7E) ) {
			i += 1
			continue
		}
		
		;sequence length implied by the lead byte, and the allowed range
		;of the byte that directly follows it
		lo:=0x80, hi:=0xBF
		if (0xC2 <= lead && lead <= 0xDF) ;// non-overlong 2-byte
			seqLen:=2
		else if (0xE0 <= lead && lead <= 0xEF)
		{
			seqLen:=3
			if (lead == 0xE0) ;// excluding overlongs
				lo:=0xA0
			else if (lead == 0xED) ;// excluding surrogates
				hi:=0x9F
		}
		else if (0xF0 <= lead && lead <= 0xF4)
		{
			seqLen:=4
			if (lead == 0xF0) ;// planes 1-3
				lo:=0x90
			else if (lead == 0xF4) ;// plane 16
				hi:=0x8F
		}
		else
			return 0 ;stray continuation byte, control character or invalid lead
		
		;never read past the examined range
		if (i + seqLen > nbytes)
			return 0
		
		k:=1
		while(k<seqLen) {
			b:=NumGet(&rawbytes,i+k,"UChar")
			if (b < lo || b > hi)
				return 0
			lo:=0x80, hi:=0xBF ;only the first continuation byte has a restricted range
			k += 1
		}
		
		i += seqLen
	}

	return 1
}

Here is the resulting function:

Code: Select all

; Heuristically decide whether a file is binary by sampling its first bytes.
;
; Filename       - path of the file to inspect.
; NumBytes       - how many leading bytes to sample (default 32).
; Minimum        - samples shorter than this are assumed to be text
;                  (4 is the recommended minimum for Unicode detection).
; complexunicode - accepted for compatibility; this implementation does not
;                  consult it.
;
; Returns 1 when the sample looks binary, 0 when it looks like text.
; Order of checks: UTF-16/UTF-8/UTF-32 BOM, then UTF-8 validation of the whole
; sample (W3C "qa-forms-utf-8" pattern), then the classic ASCII range check.
; A file that cannot be opened is reported as 0.
isBinFile(Filename,NumBytes=32,Minimum=4,complexunicode=1) {
	
	file:=FileOpen(Filename,"r")
	if (!IsObject(file)) ;open failed: nothing to sample, keep the "not binary" answer
		return 0
	file.Position:=0 ;force position to 0 (zero) so a BOM is not skipped
	nbytes:=file.RawRead(rawbytes,NumBytes) ;read bytes
	file.Close() ;close file
	
	if (nbytes < Minimum) ;recommended 4 minimum for unicode detection
		return 0 ;assume text file, if too short
	
	bytes:=[] ;zero-based byte array, c-style
	
	loop % nbytes ;create c-style bytes array
		bytes[(A_Index-1)]:=Numget(&rawbytes,(A_Index-1),"UChar")
	
	;determine BOM if possible/existent
	if (bytes[0]=0xFE && bytes[1]=0xFF)
		|| (bytes[0]=0xFF && bytes[1]=0xFE)
		return 0 ;text Utf-16 BE/LE file
	if (bytes[0]=0xEF && bytes[1]=0xBB && bytes[2]=0xBF)
		return 0 ;text Utf-8 file
	if (bytes[0]=0x00 && bytes[1]=0x00
		&& bytes[2]=0xFE && bytes[3]=0xFF)
		|| (bytes[0]=0xFF && bytes[1]=0xFE
		&& bytes[2]=0x00 && bytes[3]=0x00)
		return 0 ;text Utf-32 BE/LE file
	
	;UTF-8 validation of the sample
	isUtf8:=1, i:=0
	while(i<nbytes) {
		lead:=bytes[i]
		
		;// ASCII
		if( lead == 0x09 || lead == 0x0A || lead == 0x0D
			|| (0x20 <= lead && lead <= 0x7E) ) {
			i += 1
			continue
		}
		
		;sequence length implied by the lead byte, and the allowed range
		;of the byte that directly follows it
		seqLen:=0, lo:=0x80, hi:=0xBF
		if (0xC2 <= lead && lead <= 0xDF) ;// non-overlong 2-byte
			seqLen:=2
		else if (0xE0 <= lead && lead <= 0xEF)
		{
			seqLen:=3
			if (lead == 0xE0) ;// excluding overlongs
				lo:=0xA0
			else if (lead == 0xED) ;// excluding surrogates
				hi:=0x9F
		}
		else if (0xF0 <= lead && lead <= 0xF4)
		{
			seqLen:=4
			if (lead == 0xF0) ;// planes 1-3
				lo:=0x90
			else if (lead == 0xF4) ;// plane 16
				hi:=0x8F
		}
		if (!seqLen) { ;not a valid lead byte
			isUtf8:=0
			break
		}
		
		;status: 1 = complete and valid, 0 = invalid byte, -1 = cut off by end of sample
		status:=1, k:=1
		while(k<seqLen) {
			if (i + k >= nbytes) {
				status:=-1
				break
			}
			b:=bytes[i+k]
			if (b < lo || b > hi) {
				status:=0
				break
			}
			lo:=0x80, hi:=0xBF ;only the first continuation byte has a restricted range
			k += 1
		}
		
		if (status = 1) {
			i += seqLen
			continue
		}
		;A character split by the sample boundary is not an error when the
		;sample buffer was filled completely: everything read so far was valid
		;and the remaining bytes of the character simply were not sampled.
		;(A file of exactly NumBytes bytes ending mid-character is accepted too.)
		if (status = -1 && nbytes >= NumBytes)
			break
		isUtf8:=0
		break
	}
	
	if (isUtf8) ;the while-loop found no violation, so treat as utf-8 text
		return 0
	;else check again with the classic method below
	
	;classic check: any byte outside TAB..CR and 0x20..0x7E marks the file as binary
	loop, %nbytes% {
		b:=bytes[(A_Index-1)]
		if (b < 9 || b > 126 || (b > 13 && b < 32))
			return 1
	}
	
	return 0
}

Note: support for extended ASCII character sets has not been added yet.

cheers!

Posted: **12 Oct 2013, 15:15**

Holy Smokes Joe ... you're a freakin' genius!!! WELL DONE SIR!

Posted: **12 Oct 2013, 19:23**

Thanks

AutoHotkey Community

How to detect file is binary or ascii?

Re: How to detect file is binary or ascii?

Re: How to detect file is binary or ascii?

Re: How to detect file is binary or ascii?