open Ulexing
open Htmlparser
open Printf
open Globals

(* HTML tokenizer written with the ulex syntax extension.

   Position bookkeeping goes through the [lasttok], [col] and [ln]
   references and errors through [lerr]; none of them is defined in this
   file (presumably they come from [Globals] -- confirm).  The token
   constructors (TAG_OPEN, STRING, ...) are likewise external, presumably
   from [Htmlparser]. *)

(** [c b] records the lexeme just matched on [b] in [lasttok] and advances
    [col] by the length of that lexeme. *)
let c b = (lasttok := utf8_lexeme b; col := !col + lexeme_length b)

(** [nl ()] registers a line break: [lasttok] is set to a backslash followed
    by the letter n, [ln] is incremented and [col] is reset to 0. *)
let nl () =
  lasttok := "\\n";
  ln := !ln + 1;
  col := 0

(** [subs s i] is the suffix of [s] that starts at index [i].
    @raise Invalid_argument if [i] is negative or greater than the length
    of [s]. *)
let subs s i = String.sub s i (String.length s - i)

(* Characters accepted in tag names, attribute names and unquoted values. *)
let regexp tagname = ['a'-'z' 'A'-'Z' '-' '0'-'9']

(* Horizontal blanks; line breaks are handled separately so that [nl] can
   keep the line counter up to date. *)
let regexp whitespace = ['\t' ' ']

let regexp newline = "\r\n" | ['\n' '\r']

(** [true] from the moment a tag opener has been returned until the
    matching [>] has been returned; selects which sub-lexer [main] runs. *)
let intag = ref false

(** [main lexbuf] returns the next token, dispatching to [tag] while
    [intag] is set and to [token] otherwise. *)
let rec main lexbuf =
  if !intag then tag lexbuf else token lexbuf

(* Lexer for text outside of tags.  Line breaks are skipped (only counted);
   every other character is returned as its own one-character STRING. *)
and token = lexer
  (* The pattern and arrow of this arm were missing from the file (the text
     between the angle brackets had been stripped), which left it
     unparseable; "</" is the opener implied by TAG_SLASH_OPEN. *)
  | "</" -> c lexbuf; intag := true; TAG_SLASH_OPEN
  | "<" -> c lexbuf; intag := true; TAG_OPEN
  (* NOTE(review): this pattern was lost the same way.  "<!DOCTYPE" is the
     most plausible reconstruction; confirm the exact spelling and case
     against the grammar and the project history. *)
  | "<!DOCTYPE" -> c lexbuf; intag := true; DOCTYPE
  | newline -> nl (); token lexbuf
  | eof -> EOF
  | _ -> c lexbuf; STRING (utf8_lexeme lexbuf)

(* Lexer for the inside of a tag, i.e. after an opener and up to [>]. *)
and tag = lexer
  (* At least one blank is required here.  With a pattern that also accepts
     the empty string, any character not covered by the other arms made
     this arm match zero characters and call [tag] again forever.  Such
     input now makes the generated lexer fail with [Ulexing.Error]. *)
  | whitespace+ -> c lexbuf; tag lexbuf
  | newline -> nl (); tag lexbuf
  | ">" -> c lexbuf; intag := false; TAG_CLOSE
  | "/" -> c lexbuf; SLASH
  | "=" -> c lexbuf; EQUAL
  | tagname+ -> c lexbuf; STRING (utf8_lexeme lexbuf)
  (* Opening double quote: the quoted value is read by [string_scanner]. *)
  | "\"" -> c lexbuf; STRING (string_scanner lexbuf)
  | eof -> lerr "Unclosed tag"

(* Reads the remainder of a double-quoted value, the opening quote having
   already been consumed, and returns its contents without the quotes.
   A backslash-escaped double quote yields a plain double quote.  Reaching
   end of input or a line break before the closing quote is reported
   through [lerr]. *)
and string_scanner = lexer
  | "\\\"" -> c lexbuf; "\"" ^ string_scanner lexbuf
  (* Closing quote: count it too, otherwise [col] falls one column behind
     after every quoted value. *)
  | "\"" -> c lexbuf; ""
  | eof -> lerr "Unclosed string literal"
  | newline -> lerr "Line break in quoted string"
  | _ ->
    let s = utf8_lexeme lexbuf in
    c lexbuf;
    s ^ string_scanner lexbuf