UTF-8 Validation for REALbasic

In response to a question and ensuing discussion on the REALbasic NUG, I wrote a function to validate UTF-8 text as defined by RFC 3629.

Note that the function takes a BinaryStream parameter. To validate a String, create a new BinaryStream object using the String and pass it to the function.

Function IsValidUTF8(bs as BinaryStream) As Boolean
  // Tests the BinaryStream, starting from the current position, for UTF-8
  // compliance as defined in RFC 3629 (section 4, "Syntax of UTF-8 Byte Sequences").
  //
  // Parameters:
  //   bs - the stream to validate. Its position is advanced while reading.
  //
  // Returns:
  //   True if every byte from the starting position to the end of the stream
  //   belongs to a well-formed UTF-8 sequence (an optional leading BOM is
  //   accepted and skipped); False as soon as an ill-formed or truncated
  //   sequence is found. An empty stream is considered valid.
  //
  // Well-formed sequences (lead byte, then the allowed range of each trailing byte):
  //   00..7F
  //   C2..DF  80..BF
  //   E0      A0..BF  80..BF      (A0 lower bound rejects overlong forms)
  //   E1..EC  80..BF  80..BF
  //   ED      80..9F  80..BF      (9F upper bound rejects UTF-16 surrogates)
  //   EE..EF  80..BF  80..BF
  //   F0      90..BF  80..BF  80..BF   (90 lower bound rejects overlong forms)
  //   F1..F3  80..BF  80..BF  80..BF
  //   F4      80..8F  80..BF  80..BF   (8F upper bound rejects values above U+10FFFF)

  #pragma disableBackgroundTasks

  // First, check for a BOM (EF BB BF). EOF is tested before each read so a
  // stream with fewer than three bytes left is never read past its end.
  dim startPosition as UInt64 = bs.Position
  dim foundBOM as Boolean = false
  if not bs.EOF then
    if bs.ReadUInt8 = &hef then
      if not bs.EOF then
        if bs.ReadUInt8 = &hbb then
          if not bs.EOF then
            foundBOM = (bs.ReadUInt8 = &hbf)
          end if
        end if
      end if
    end if
  end if
  if not foundBOM then
    // No BOM: rewind so the bytes just examined are validated as ordinary text.
    bs.Position = startPosition
  end if

  do until bs.EOF
    dim b1 as UInt8 = bs.ReadUInt8

    // Classify the lead byte: how many continuation bytes must follow, and
    // the range allowed for the FIRST of them. All later continuation bytes
    // must lie in 80..BF.
    dim trailingCount as Integer = 0
    dim minByte as UInt8 = &h80
    dim maxByte as UInt8 = &hbf

    if b1 <= &h7f then
      // One-byte sequence (ASCII); nothing further to read.
      trailingCount = 0
    elseIf (&hc2 <= b1) and (b1 <= &hdf) then
      trailingCount = 1
    elseIf b1 = &he0 then
      trailingCount = 2
      minByte = &ha0
    elseIf b1 = &hed then
      trailingCount = 2
      maxByte = &h9f
    elseIf (&he1 <= b1) and (b1 <= &hef) then
      // E1..EC and EE..EF (ED was handled above).
      trailingCount = 2
    elseIf b1 = &hf0 then
      trailingCount = 3
      minByte = &h90
    elseIf (&hf1 <= b1) and (b1 <= &hf3) then
      trailingCount = 3
    elseIf b1 = &hf4 then
      trailingCount = 3
      maxByte = &h8f
    else
      // 80..BF (stray continuation byte), C0..C1 and F5..FF can never start
      // a sequence.
      return false
    end if

    for i as Integer = 1 to trailingCount
      if bs.EOF then
        // The stream ended in the middle of a multi-byte sequence.
        return false
      end if

      dim nextByte as UInt8 = bs.ReadUInt8
      if (nextByte < minByte) or (nextByte > maxByte) then
        return false
      end if

      // Only the first continuation byte has a restricted range.
      minByte = &h80
      maxByte = &hbf
    next
  loop

  return true
End Function