# -*- tcl -*-
# Lexing C
package provide clex 1.0
namespace eval clex {}

# Most tokens can be recognized and separated from each other via
# [string map]. The remainder are identifiers, except for strings
# and comments. These two may contain any other token, which must
# not be acted upon, and each may contain sequences reminiscent of
# the other. Because of that the first step is to identify and parse
# out comments and strings, where 'parsing out' means that the
# strings are replaced with special character sequences (ids) which
# refer the lexer to a small database, while comments are discarded.
# In the final output the extracted strings are put back in place.
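#
# For illustration (a sketch only, with \000 standing in for the
# actual NUL byte used in the ids), a fragment like
#
#     printf("%d\n", x); /* debug */
#
# would be reduced to roughly
#
#     printf(\000@0\000, x);
#
# with the map recording that \000@0\000 stands for the string
# literal "%d\n", and the comment dropped entirely.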

proc clex::SCextract {code mapvar} {
    # Extract strings and comments from the code and record them in
    # mapvar. Inside of the code they are replaced with unique ids.

    upvar $mapvar map

    set tmp ""
    set count 0
    while {1} {
        set comment [string first "/*" $code]
        set string  [string first "\"" $code]

        if {($comment < 0) && ($string < 0)} {
            # Neither comments nor strings left to extract.
            break
        }
        # Debug out
        #puts "$string | $comment | [string length $code]"

        if {
            (($string >= 0) && ($comment >= 0) && ($string < $comment)) ||
            (($string >= 0) && ($comment < 0))
        } {
            # The next variable-sized token is a "-quoted string.
            # Finding its end is a bit more difficult, because we
            # have to accept \" as a single character inside of the
            # string.

            set from $string
            while 1 {
                incr from
                # Locate the next quote and the next escaped quote
                # (\"). If both point at the same quote character it
                # is escaped, so skip past it and search again.
                set stop  [string first "\""   $code $from]
                set stopb [string first "\\\"" $code $from]
                incr stopb
                if {$stop == $stopb} {set from $stopb ; incr from ; continue}
                break
            }

            set id \000@$count\000
            incr count
            lappend map $id [set sub [string range $code $string $stop]]

            incr stop ; incr string -1
            append tmp [string range $code 0 $string] $id
            set code [string range $code $stop end]

            # Debug out
            #puts "\tSTR $id <$sub>"
            continue
        }
        if {
            (($string >= 0) && ($comment >= 0) && ($comment < $string)) ||
            (($comment >= 0) && ($string < 0))
        } {
            # The next variable-sized token is a comment. Comments
            # are dropped entirely.

            set stop [string first "*/" $code $comment]
            incr stop 2
            incr comment -1

            append tmp [string range $code 0 $comment]
            set code   [string range $code $stop end]
            continue
        }
        return -code error "Panic, string and comment at the same location"
    }
    append tmp $code
    return $tmp
}
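
# A small usage sketch for SCextract (\000 again standing in for the
# NUL bytes in the generated ids, output shown only roughly):
#
#   % set scmap {}
#   % clex::SCextract {printf("%d\n", x); /* debug */} scmap
#   printf(\000@0\000, x);
#   % set scmap
#   \000@0\000 {"%d\n"}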

proc clex::DefStart {} {
    variable tokens [list]
    #puts "== <$tokens>"
    return
}

proc clex::Key {string {replacement {}}} {
    variable tokens
    if {$replacement == {}} {
        set replacement \000\001[string toupper $string]\000
    } else {
        set replacement \000\001$replacement\000
    }
    lappend tokens $string $replacement
    #puts "== <$tokens>"
    return
}

proc clex::DefEnd {} {
    variable tokens
    array set tmp $tokens
    set res [list]
    # Sort the token strings in decreasing order so that longer
    # tokens (like "<=") come before their prefixes (like "<").
    # [string map] checks keys in list order, which makes the
    # longest defined token win during phase II of the lexer.
    foreach key [lsort -decreasing [array names tmp]] {
        lappend res $key $tmp($key)
    }
    set tokens $res
    #puts "== <$tokens>"
    return
}

proc clex::lex {code} {
    variable tokens

    # Phase I   ... Extract strings and comments so that they don't
    #               interfere with the remaining phases.
    # Phase II  ... Separate all constant-sized tokens (keywords and
    #               punctuation) from each other.
    # Phase III ... Separate whitespace from the useful text, i.e.
    #               convert each whitespace run into a single
    #               separator character.
    # Phase IV  ... Reinsert the extracted tokens and clean up
    #               multi-separator sequences.
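    #
    # A rough sketch of the intermediate forms for a tiny fragment,
    # assuming the token definitions at the bottom of this file and
    # writing \000 / \001 for the actual separator and marker bytes:
    #
    #   input : a = "b"; /* c */
    #   I     : a = \000@0\000;                 (scmap: \000@0\000 -> "b")
    #   II    : a \000\001ASSIGN\000 \000@0\000\000\001SEMICOLON\000
    #   III/IV: a \001ASSIGN "b" \001SEMICOLON  (as a list, after the split)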

    set scmap [list]

    if 0 {
        # Minimal number of commands for all phases
        regsub -all -- "\[\t\n \]+" [string map $tokens \
                [SCextract $code scmap]] \
                \000 tmp
        set code [split \
                [string trim \
                    [string map "\000\000\000 \000 \000\000 \000" \
                        [string map $scmap \
                            $tmp]] \000] \000]
    }
    if 1 {
        # Each phase spelled out explicitly ...
        set code [SCextract $code scmap]                              ; # I
        set code [string map $tokens $code]                           ; # II
        regsub -all -- "\[\t\n \]+" $code \000 code                   ; # III
        set code [string map $scmap $code]                            ; # IV/a
        set code [string map "\000\000\000 \000 \000\000 \000" $code] ; # IV/b
        set code [string trim $code \000]
        set code [split $code \000]
    }

    # Run through the list and create something usable by the parser:
    # a list of pairs (each pair being a list of 2 elements), where
    # each pair contains the symbol to give to the parser, and the
    # associated data, if any.

    set tmp [list]
    foreach lex $code {
        switch -glob -- [string index $lex 0] {
            \001 {
                # Keyword or punctuation token, no data.
                lappend tmp [list [string range $lex 1 end] {}]
            }
            ' - [0-9] {
                # Character or numeric constant.
                lappend tmp [list CONSTANT $lex]
            }
            \" {
                # String literal. Strip the double-quotes.
                lappend tmp [list STRING_LITERAL [string range $lex 1 end-1]]
            }
            default {
                # Identifier. This code does not distinguish between
                # identifiers and type-names yet. This is deferred to
                # the 'scanner', i.e. the glue code feeding the lexer
                # symbols into the parser.
                lappend tmp [list IDENTIFIER $lex]
            }
        }
    }

    set code $tmp
    return $code
}

namespace eval clex {
    DefStart

    Key ( LPAREN ; Key ) RPAREN ; Key -> DEREF
    Key < LT ; Key <= LE ; Key == EQ
    Key > GT ; Key >= GE ; Key != NE
    Key \[ LBRACKET ; Key \] RBRACKET ; Key = ASSIGN
    Key \{ LBRACE ; Key \} RBRACE ; Key *= MUL_ASSIGN
    Key . DOT ; Key , COMMA ; Key /= DIV_ASSIGN
    Key ++ INCR_OP ; Key -- DECR_OP ; Key %= REM_ASSIGN
    Key & ADDR_BITAND ; Key * MULT_STAR ; Key += PLUS_ASSIGN
    Key + PLUS ; Key - MINUS ; Key -= MINUS_ASSIGN
    Key ~ BITNOT ; Key ! LOGNOT ; Key <<= LSHIFT_ASSIGN
    Key / DIV ; Key % REM ; Key >>= RSHIFT_ASSIGN
    Key << LSHIFT ; Key >> RSHIFT ; Key &= BITAND_ASSIGN
    Key ^ BITEOR ; Key && LOGAND ; Key ^= BITEOR_ASSIGN
    Key | BITOR ; Key || LOGOR ; Key |= BITOR_ASSIGN
    Key ? QUERY ; Key : COLON ; Key \; SEMICOLON
    Key ... ELLIPSIS

    Key typedef ; Key extern ; Key static ; Key auto ; Key register
    Key void ; Key char ; Key short ; Key int ; Key long
    Key float ; Key double ; Key signed ; Key unsigned
    Key goto ; Key continue ; Key break ; Key return
    Key case ; Key default ; Key switch
    Key struct ; Key union ; Key enum
    Key while ; Key do ; Key for
    Key const ; Key volatile
    Key if ; Key else
    Key sizeof

    DefEnd
}
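
# A small usage sketch: with the definitions above in place, lexing a
# tiny statement should yield a list of symbol/data pairs roughly
# like this:
#
#   % clex::lex {a = "b"; /* comment */}
#   {IDENTIFIER a} {ASSIGN {}} {STRING_LITERAL b} {SEMICOLON {}}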