diff options
Diffstat (limited to 'perl_checker.src/lexer.mll')
-rw-r--r-- | perl_checker.src/lexer.mll | 423 |
1 files changed, 423 insertions, 0 deletions
diff --git a/perl_checker.src/lexer.mll b/perl_checker.src/lexer.mll new file mode 100644 index 0000000..73c40bb --- /dev/null +++ b/perl_checker.src/lexer.mll @@ -0,0 +1,423 @@ +{ (* -*- caml -*- *) +open Parser +open Common +open Lexing +open Info + + +let next_rule = ref None + +let here_docs = Queue.create() +let current_here_doc_mark = ref "" + +let delimit_char = ref '/' +let not_ok_for_match = ref (-1) +let string_nestness = ref 0 + +let building_current_string = ref "" +let current_string_start_pos = ref 0 + +let ins_with_offset offset t lexbuf = + building_current_string := ""; current_string_start_pos := lexeme_start lexbuf + offset; + t lexbuf ; + !building_current_string, (!current_file, !current_string_start_pos, lexeme_end lexbuf) +let ins t lexbuf = ins_with_offset 0 t lexbuf +let ins_to_string t lexbuf = + let s, pos = ins t lexbuf in + not_ok_for_match := lexeme_end lexbuf; + STRING(s, pos) + +let next_s s t lexbuf = + building_current_string := !building_current_string ^ s ; + t lexbuf +let next t lexbuf = next_s (lexeme lexbuf) t lexbuf + +let pos lexbuf = !current_file, lexeme_start lexbuf, lexeme_end lexbuf + +let pos2sfull_with start end_ = Info.pos2sfull (!current_file, start, end_) + +let pos2sfull lexbuf = pos2sfull_with (lexeme_start lexbuf) (lexeme_end lexbuf) + +let die lexbuf err = failwith (pos2sfull_with !current_string_start_pos (lexeme_end lexbuf) ^ err) + +let rec lexbuf2list t lexbuf = + let rec f () = + match t lexbuf with + | EOF -> [] + | e -> e :: f() + in + let l = f() in + l + +let ident_type_from_char fq name lexbuf c = + not_ok_for_match := lexeme_end lexbuf; + match c with + | '$' -> SCALAR_IDENT(fq, name, pos lexbuf) + | '@' -> ARRAY_IDENT (fq, name, pos lexbuf) + | '%' -> HASH_IDENT (fq, name, pos lexbuf) + | '&' -> FUNC_IDENT (fq, name, pos lexbuf) + | '*' -> STAR_IDENT (fq, name, pos lexbuf) + | _ -> internal_error "ident_type_from_char" + +let ident_from_lexbuf lexbuf = + let fq, name = split_at_two_colons (lexeme lexbuf) in + RAW_IDENT(Some fq, name, pos lexbuf) + +let typed_ident_from_lexbuf lexbuf = + let s = lexeme lexbuf in + ident_type_from_char None (skip_n_char 1 s) lexbuf s.[0] + +let typed_fqident_from_lexbuf lexbuf = + let s = lexeme lexbuf in + let fq, name = split_at_two_colons (skip_n_char 1 s) in + ident_type_from_char (Some fq) name lexbuf s.[0] + +let arraylen_ident_from_lexbuf lexbuf = + not_ok_for_match := lexeme_end lexbuf; + let s = lexeme lexbuf in + ARRAYLEN_IDENT(None, skip_n_char 2 s, pos lexbuf) + +let arraylen_fqident_from_lexbuf lexbuf = + let s = lexeme lexbuf in + let fq, name = split_at_two_colons (skip_n_char 2 s) in + ARRAYLEN_IDENT(Some fq, name, pos lexbuf) + +} + +let space = [' ' '\t'] +let stash = [ '$' '@' '%' '&' '*' ] +let ident_start = ['a'-'z' 'A'-'Z' '_'] +let ident = ident_start ['0'-'9' 'A'-'Z' 'a'-'z' '_'] * +let pattern_separator = [ '/' '!' ',' '|' ] + +rule token = parse +| space+ { + (* propagate not_ok_for_match when it was set by the previous token *) + if lexeme_start lexbuf = !not_ok_for_match then not_ok_for_match := lexeme_end lexbuf; + (*SPACE(pos lexbuf) *) token lexbuf + } +| '#' [^ '\n']* { (*COMMENT(lexeme lexbuf, pos lexbuf)*) token lexbuf } + +| "\n=" { + let s, pos = ins_with_offset 1 pod_command lexbuf in POD(s, pos) } + +| '\n' { + lpush current_file_lines_starts (lexeme_end lexbuf); + (try + let (interpolate, mark, r) = Queue.pop here_docs in + current_here_doc_mark := mark ; + r := ins (if interpolate then here_doc else raw_here_doc) lexbuf + with Queue.Empty -> ()); + token lexbuf + } +| "->" { ARROW } +| "++" { INCR } +| "--" { DECR } +| "**" { POWER } +| "!" { TIGHT_NOT } +| "~" { BIT_NEG } +| "=~" { PATTERN_MATCH } +| "!~" { PATTERN_MATCH_NOT } +| "*" { MULT } +| "%" { MODULO } +| "x" { REPLICATE } +| "+" { PLUS } +| "-" { MINUS } +| "." { CONCAT } +| "<<" { BIT_SHIFT_LEFT } +| ">>" { BIT_SHIFT_RIGHT } +| "<" | ">" | "<=" | ">=" | "lt" | "gt" | "le" | "ge" { COMPARE_OP(lexeme lexbuf) } +| "==" | "!=" | "<=>" | "eq" | "ne" | "cmp" { EQ_OP(lexeme lexbuf) } +| "&" { BIT_AND } +| "|" { BIT_OR } +| "^" { BIT_XOR } +| "&&" { AND_TIGHT } +| "||" { OR_TIGHT } +| ".." { DOTDOT } +| "..." { DOTDOTDOT } +| "?" { QUESTION_MARK } +| ":" { COLON } +| "::" { PKG_SCOPE } + +| "=" | "+=" | "-=" | "*=" | "/=" | ".=" | "|=" | "&=" | "^=" | "||=" | "&&=" { ASSIGN(lexeme lexbuf) } + +| "," { COMMA } +| "=>" { RIGHT_ARROW } +| "not" { NOT } +| "and" { AND } +| "or" { OR } +| "xor" { XOR } + +| "if" { IF } +| "unless" { UNLESS } +| "do" { DO } +| "while" { WHILE } +| "until" { UNTIL } +| "foreach" { FOR("foreach") } +| "for" { FOR("for") } +| "my" { MY } +| "local" { LOCAL } +| "continue" { CONTINUE } +| "sub" { SUB } +| "format" { FORMAT } +| "package" { PACKAGE } +| "use" { USE } +| "print" { PRINT(pos lexbuf) } +| "new" { NEW(pos lexbuf) } + +| '@' { AT } +| '$' { DOLLAR } +| '%' { PERCENT } +| '&' { AMPERSAND } +| '*' { STAR } +| "$#" { ARRAYLEN } + + +| ';' { SEMI_COLON } +| '(' { PAREN } +| '{' { BRACKET } +| '[' { ARRAYREF } +| ')' { not_ok_for_match := lexeme_end lexbuf; PAREN_END } +| '}' { not_ok_for_match := lexeme_end lexbuf; BRACKET_END } +| ']' { not_ok_for_match := lexeme_end lexbuf; ARRAYREF_END } + +| '(' [ '$' '@' '\\' '&' ';' ]+ ')' { + (* bloody prototypes, must be caught especially otherwise "($)" is badly tokenized *) + PROTOTYPE(lexeme lexbuf, pos lexbuf) + } + +| "/" { + if lexeme_start lexbuf = !not_ok_for_match then DIVISION + else ( + delimit_char := '/' ; + let s, pos = ins delimited_string lexbuf in + let opts, _ = ins pattern_options lexbuf in + PATTERN(s, opts, pos) + ) + } + +| "m" pattern_separator { + delimit_char := lexeme_char lexbuf 1 ; + let s, pos = ins delimited_string lexbuf in + let opts, _ = ins pattern_options lexbuf in + PATTERN(s, opts, pos) +} + +| "qr" pattern_separator { + delimit_char := lexeme_char lexbuf 2 ; + let s, pos = ins delimited_string lexbuf in + let opts, _ = ins pattern_options lexbuf in + PATTERN(s, opts, pos) +} + +| "s" pattern_separator { + delimit_char := lexeme_char lexbuf 1 ; + let s1, (_, start, _) = ins delimited_string lexbuf in + let s2, (_, _, end_) = ins delimited_string lexbuf in + let opts, _ = ins pattern_options lexbuf in + PATTERN_SUBST(s1, s2, opts, (!current_file, start, end_)) +} + +| "tr" pattern_separator { + delimit_char := lexeme_char lexbuf 2 ; + let s1, (_, start, _) = ins delimited_string lexbuf in + let s2, (_, _, end_) = ins delimited_string lexbuf in + let opts, _ = ins pattern_options lexbuf in + PATTERN_SUBST(s1, s2, opts, (!current_file, start, end_)) +} + +| "<<" ident { + let here_doc_ref = ref("", bpos) in + Queue.push (true, skip_n_char 2 (lexeme lexbuf), here_doc_ref) here_docs ; + HERE_DOC here_doc_ref + } +| "<<'" ident "'" { + not_ok_for_match := lexeme_end lexbuf; + let here_doc_ref = ref("", bpos) in + Queue.push (false, skip_n_char_ 3 1 (lexeme lexbuf), here_doc_ref) here_docs ; + HERE_DOC here_doc_ref + } +| "<<" space+ "'" +| "<<" space+ ident { + failwith (pos2sfull_with (lexeme_start lexbuf + 2) (lexeme_end lexbuf) ^ "No space allowed between \"<<\" and the marker") + } +| "<<" space* '"' { + failwith (pos2sfull_with (lexeme_start lexbuf + 2) (lexeme_end lexbuf) ^ "Don't use <<\"MARK\", use <<MARK instead") + } + +| "\\" stash +| "\\" ['0'-'9' 'A'-'Z' 'a'-'z'] +| "\\" space* '(' + { lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - 1; REF } + +| "$#" ident? ("::" ident)+ { arraylen_fqident_from_lexbuf lexbuf } +| "$#" ident { arraylen_ident_from_lexbuf lexbuf } + +| stash ident? ("::" ident)+ { typed_fqident_from_lexbuf lexbuf } +| stash ident +| stash '^'? [^ '{' ' ' '\t' '\n'] { typed_ident_from_lexbuf lexbuf } + +| ident? ("::" ident)+ { ident_from_lexbuf lexbuf } +| ident { BAREWORD(lexeme lexbuf, pos lexbuf) } + +| ident ":" { LABEL(lexeme lexbuf, pos lexbuf) } + +| ['0'-'9'] ['0'-'9' '_']* ('.' ['0'-'9'] ['0'-'9' '_']*)+ +| 'v' ['0'-'9'] ['0'-'9' '_']* ('.' ['0'-'9'] ['0'-'9' '_']*)* + { + not_ok_for_match := lexeme_end lexbuf; + REVISION(lexeme lexbuf, pos lexbuf) + } + +| ['0'-'9']* '.' ['0'-'9']+ (['e' 'E']['-' '+']?['0'-'9']+)? { + not_ok_for_match := lexeme_end lexbuf; + NUM(lexeme lexbuf, pos lexbuf) + } +| ['0'-'9'] ['0'-'9' '_']* { + not_ok_for_match := lexeme_end lexbuf; + NUM(lexeme lexbuf, pos lexbuf) + } +| "0x" ['0'-'9' 'a'-'f' 'A'-'F']+ { + not_ok_for_match := lexeme_end lexbuf; + NUM(lexeme lexbuf, pos lexbuf) + } + +| '"' { ins_to_string string lexbuf } +| "'" { ins_to_string rawstring lexbuf } +| '`' { delimit_char := '`'; + not_ok_for_match := lexeme_end lexbuf; + let s, pos = ins delimited_string lexbuf in COMMAND_STRING(s, pos) } +| "q(" { ins_to_string qstring lexbuf } +| "qq(" { ins_to_string qqstring lexbuf } +| "qw(" { let s, pos = ins qstring lexbuf in QUOTEWORDS(s, pos) } + +| eof { EOF } +| _ { failwith (Printf.sprintf "%serror tokenizing <<%s>>" (pos2sfull lexbuf) (lexeme lexbuf)) } + +and string = parse + '"' { () } +| '\\' { next_rule := Some string ; string_escape lexbuf } +| '\n' { + lpush current_file_lines_starts (lexeme_end lexbuf); + next string lexbuf + } +| [^ '\n' '\\' '"']+ { next string lexbuf } +| eof { die lexbuf "Unterminated_string" } + +and delimited_string = parse +| '\\' { next_rule := Some delimited_string ; string_escape lexbuf } +| '\n' { + lpush current_file_lines_starts (lexeme_end lexbuf); + next delimited_string lexbuf + } +| eof { die lexbuf "Unterminated_delimited_string" } +| [ ^ '\\' '\n' ] { if lexeme_char lexbuf 0 <> !delimit_char then next delimited_string lexbuf } + +and rawstring = parse + ''' { () } +| '\n' { + lpush current_file_lines_starts (lexeme_end lexbuf); + next rawstring lexbuf + } +| "\\'" +| [^ '\n' ''']+ { next rawstring lexbuf } +| eof { die lexbuf "Unterminated_rawstring" } + +and qqstring = parse + ')' { + if !string_nestness <> 0 then (decr string_nestness; next qqstring lexbuf) + } +| '(' { + incr string_nestness; + next qqstring lexbuf + } +| '\\' { next_rule := Some qqstring ; string_escape lexbuf } +| '\n' { + lpush current_file_lines_starts (lexeme_end lexbuf); + next qqstring lexbuf + } +| [^ '\n' '(' ')' '\\']+ { next qqstring lexbuf } +| eof { die lexbuf "Unterminated_qqstring" } + +and qstring = parse +| ')' { + if !string_nestness <> 0 then (decr string_nestness ; next qstring lexbuf) + } +| '(' { + incr string_nestness; + next qstring lexbuf + } +| '\n' { + lpush current_file_lines_starts (lexeme_end lexbuf); + next qstring lexbuf + } +| [^ '\n' '(' ')']+ { next qstring lexbuf } +| eof { die lexbuf "Unterminated_qstring" } + +and here_doc = parse +| '\\' { next_rule := Some here_doc ; string_escape lexbuf } +| [ ^ '\n' '\\' ]* { + let s = lexeme lexbuf in + if chomps s <> !current_here_doc_mark + then next_s s here_doc lexbuf + else if s <> !current_here_doc_mark then Printf.eprintf "%sTrailing spaces after HERE-document mark\n" (pos2sfull lexbuf) + } +| '\n' { + lpush current_file_lines_starts (lexeme_end lexbuf); + next here_doc lexbuf + } +| eof { die lexbuf "Unterminated_here_doc" } + +and raw_here_doc = parse +| [ ^ '\n' ]* { + let s = lexeme lexbuf in + if chomps s <> !current_here_doc_mark + then next_s s raw_here_doc lexbuf + else if s <> !current_here_doc_mark then Printf.eprintf "%sTrailing spaces after HERE-document mark\n" (pos2sfull lexbuf) + } +| '\n' { + lpush current_file_lines_starts (lexeme_end lexbuf); + next raw_here_doc lexbuf + } +| eof { die lexbuf "Unterminated_raw_here_doc" } + + +and string_escape = parse +| '0' { next_s "\000" (some !next_rule) lexbuf } +| '"' { next_s "\"" (some !next_rule) lexbuf } +| ''' { next_s "'" (some !next_rule) lexbuf } +| 'n' { next_s "\n" (some !next_rule) lexbuf } +| 't' { next_s "\t" (some !next_rule) lexbuf } +| 'x' _ _ { + try + let s = String.make 1 (Char.chr (int_of_string ("0" ^ lexeme lexbuf))) in + next_s s (some !next_rule) lexbuf + with Failure("int_of_string") -> die lexbuf ("Bad_hex_in_string \"" ^ lexeme lexbuf ^ "\"") + } +| _ { next_s ("\\" ^ lexeme lexbuf) (some !next_rule) lexbuf } + +and pattern_options = parse +| [ 'g' 'i' 'm' 'o' 's' 'x' 'e' ] { next pattern_options lexbuf } +| _ { lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - 1; () } + +and pod_command = parse +| [^ '\n' ]+ { + let s = lexeme lexbuf in + if String.contains s '\t' then failwith(pos2sfull lexbuf ^ "tabulations not accepted in POD commands") else + let command = String.sub s 0 (try String.index s ' ' with Not_found -> String.length s) in + match command with + | "cut" -> + if !building_current_string = "" then + failwith(pos2sfull lexbuf ^ "found POD command \"=cut\" but it is not a POD block") + | "head1" | "head2" | "head3" | "head4" | "over" | "item" | "back" | "pod" | "begin" | "end" | "for" -> + next pod lexbuf + | s -> failwith(pos2sfull lexbuf ^ "unknown POD command \"" ^ s ^ "\"") + } +| _ { failwith(pos2sfull lexbuf ^ "POD command expected") } + +and pod = parse +| "\n=" { next pod_command lexbuf } +| "\n" [^ '=' '\n'] [^ '\n']* +| "\n" { next pod lexbuf } +| eof +| _ { failwith(pos2sfull_with !current_string_start_pos (lexeme_end lexbuf) ^ "POD block still open") } |