UTF-8 Validation for REALbasic
· Mar 6, 12:14 PM
In response to a question and ensuing discussion on the REALbasic NUG, I wrote a function to validate UTF-8 text as defined by RFC 3629.
Note that the function takes a BinaryStream parameter. To validate a String, create a new BinaryStream object using the String and pass it to the function.
Function IsValidUTF8(bs as BinaryStream) As Boolean
  // Tests the BinaryStream, starting from the current position, for UTF-8
  // compliance as defined in RFC 3629 (section 4, "Syntax of UTF-8 Byte Sequences").
  //
  // Parameters:
  //   bs - the stream to examine. Bytes are consumed from its current position.
  //
  // Returns:
  //   True  - every byte from the starting position to the end of the stream
  //           belongs to a well-formed UTF-8 sequence (an empty stream, or a
  //           stream holding only a BOM, is valid).
  //   False - an invalid lead byte, an out-of-range continuation byte, or a
  //           sequence truncated by the end of the stream was found.
  //
  // A leading UTF-8 BOM (EF BB BF) is skipped if present.
  #pragma disableBackgroundTasks

  // First, check for a BOM. EOF is tested before every read so that a stream
  // shorter than three bytes is never read past its end.
  dim startPosition as UInt64 = bs.Position
  dim foundBOM as Boolean = false
  if not bs.EOF then
    dim m1 as UInt8 = bs.ReadUInt8
    if m1 = &hef and not bs.EOF then
      dim m2 as UInt8 = bs.ReadUInt8
      if m2 = &hbb and not bs.EOF then
        dim m3 as UInt8 = bs.ReadUInt8
        foundBOM = (m3 = &hbf)
      end if
    end if
  end if
  if not foundBOM then
    // No BOM: rewind so the probed bytes are validated like any others.
    bs.Position = startPosition
  end if

  dim i as Integer
  while not bs.EOF
    dim b1 as UInt8 = bs.ReadUInt8
    if b1 > &h7f then
      // Not a one-byte (ASCII) sequence. The lead byte fixes the sequence
      // length and the legal range of the SECOND byte; the tightened ranges
      // exclude overlong forms, UTF-16 surrogates, and values above U+10FFFF.
      dim seqLength as Integer
      dim b2Min as UInt8 = &h80
      dim b2Max as UInt8 = &hbf
      if &hc2 <= b1 and b1 <= &hdf then
        seqLength = 2
      elseIf b1 = &he0 then
        seqLength = 3
        b2Min = &ha0 // E0 80..9F would be an overlong encoding
      elseIf b1 = &hed then
        seqLength = 3
        b2Max = &h9f // ED A0..BF would encode surrogates U+D800..U+DFFF
      elseIf &he1 <= b1 and b1 <= &hef then
        seqLength = 3 // E1..EC and EE..EF (ED was handled above)
      elseIf b1 = &hf0 then
        seqLength = 4
        b2Min = &h90 // F0 80..8F would be an overlong encoding
      elseIf b1 = &hf4 then
        seqLength = 4
        b2Max = &h8f // F4 90..BF would exceed U+10FFFF
      elseIf &hf1 <= b1 and b1 <= &hf3 then
        seqLength = 4
      else
        // 80..BF (stray continuation), C0..C1 (overlong), F5..FF (out of range)
        return false
      end if

      // Second byte: must exist and fall in the lead-specific range.
      if bs.EOF then
        return false // sequence truncated by end of stream
      end if
      dim b2 as UInt8 = bs.ReadUInt8
      if b2 < b2Min or b2 > b2Max then
        return false
      end if

      // Remaining bytes (third and fourth, if any): plain continuation bytes.
      for i = 3 to seqLength
        if bs.EOF then
          return false // sequence truncated by end of stream
        end if
        dim bn as UInt8 = bs.ReadUInt8
        if bn < &h80 or bn > &hbf then
          return false
        end if
      next
    end if
  wend

  // Reached the end of the stream without finding an ill-formed sequence.
  return true
End Function
Commenting is closed for this article.