--[[--------------------------------------------------------------------

  optlex.lua: does lexer-based optimizations
  This file is part of LuaSrcDiet.

  Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net>
  The COPYRIGHT file describes the conditions
  under which this software may be distributed.

  See the ChangeLog for more information.

----------------------------------------------------------------------]]

--[[--------------------------------------------------------------------
-- NOTES:
-- * For more lexer-based optimization ideas, see the TODO items or
--   look at technotes.txt.
-- * TODO: general string delimiter conversion optimizer
-- * TODO: (numbers) warn if overly significant digit
----------------------------------------------------------------------]]

local base = _G
local string = require "string"
module "optlex"
local match = string.match
local sub = string.sub
local find = string.find
local rep = string.rep
local print

------------------------------------------------------------------------
-- variables and data structures
------------------------------------------------------------------------

-- error function; can be overridden by assigning a custom function to the module
error = base.error

warn = {}                       -- table for warning flags

local stoks, sinfos, stoklns    -- source lists

local is_realtoken = {          -- significant (grammar) tokens
  TK_KEYWORD = true,
  TK_NAME = true,
  TK_NUMBER = true,
  TK_STRING = true,
  TK_LSTRING = true,
  TK_OP = true,
  TK_EOS = true,
}
local is_faketoken = {          -- whitespace (non-grammar) tokens
  TK_COMMENT = true,
  TK_LCOMMENT = true,
  TK_EOL = true,
  TK_SPACE = true,
}

local opt_details               -- for extra information

------------------------------------------------------------------------
-- true if current token is at the start of a line
-- * skips over deleted tokens via recursion
------------------------------------------------------------------------

local function atlinestart(i)
  local tok = stoks[i - 1]
  if i <= 1 or tok == "TK_EOL" then
    return true
  elseif tok == "" then
    return atlinestart(i - 1)
  end
  return false
end

------------------------------------------------------------------------
-- true if current token is at the end of a line
-- * skips over deleted tokens via recursion
------------------------------------------------------------------------

local function atlineend(i)
  local tok = stoks[i + 1]
  if i >= #stoks or tok == "TK_EOL" or tok == "TK_EOS" then
    return true
  elseif tok == "" then
    return atlineend(i + 1)
  end
  return false
end

------------------------------------------------------------------------
-- counts comment EOLs inside a long comment
-- * in order to keep line numbering, EOLs need to be reinserted
------------------------------------------------------------------------

local function commenteols(lcomment)
  local sep = #match(lcomment, "^%-%-%[=*%[")
  local z = sub(lcomment, sep + 1, -(sep - 1))  -- remove delims
  local i, c = 1, 0
  while true do
    local p, q, r, s = find(z, "([\r\n])([\r\n]?)", i)
    if not p then break end     -- if no matches, done
    i = p + 1
    c = c + 1
    if #s > 0 and r ~= s then   -- skip CRLF or LFCR
      i = i + 1
    end
  end
  return c
end

------------------------------------------------------------------------
-- compares two tokens (i, j) and returns the whitespace required
-- * important! see technotes.txt for more information
-- * only two grammar/real tokens are being considered
-- * if "", no separation is needed
-- * if " ", then at least one whitespace (or EOL) is required
------------------------------------------------------------------------

local function checkpair(i, j)
  local match = match
  local t1, t2 = stoks[i], stoks[j]
  --------------------------------------------------------------------
  if t1 == "TK_STRING" or t1 == "TK_LSTRING" or
     t2 == "TK_STRING" or t2 == "TK_LSTRING" then
    return ""
  --------------------------------------------------------------------
  elseif t1 == "TK_OP" or t2 == "TK_OP" then
    if (t1 == "TK_OP" and (t2 == "TK_KEYWORD" or t2 == "TK_NAME")) or
       (t2 == "TK_OP" and (t1 == "TK_KEYWORD" or t1 == "TK_NAME")) then
      return ""
    end
    if t1 == "TK_OP" and t2 == "TK_OP" then
      -- for TK_OP/TK_OP pairs, see notes in technotes.txt
      local op, op2 = sinfos[i], sinfos[j]
      if (match(op, "^%.%.?$") and match(op2, "^%.")) or
         (match(op, "^[~=<>]$") and op2 == "=") or
         (op == "[" and (op2 == "[" or op2 == "=")) then
        return " "
      end
      return ""
    end
    -- "TK_OP" + "TK_NUMBER" case
    local op = sinfos[i]
    if t2 == "TK_OP" then op = sinfos[j] end
    if match(op, "^%.%.?%.?$") then
      return " "
    end
    return ""
  --------------------------------------------------------------------
  else  -- "TK_KEYWORD" | "TK_NAME" | "TK_NUMBER"
    return " "
  --------------------------------------------------------------------
  end
end
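
------------------------------------------------------------------------
-- illustrative examples for checkpair (added for clarity; not part of
-- the original module) -- pairs that must keep one space per the
-- patterns above, everything else may abut directly:
-- * a dot op followed by a dot-initial op, e.g. ".." then "...":
--   the dots would otherwise fuse into a different token
-- * "~" "=" "<" ">" followed by "=": would fuse into "~=", "==", etc.
-- * "[" followed by "[" or "=": would mis-lex as a long bracket,
--   e.g. t[ [[s]] ] must not become t[[[s]]]
-- * "." ".." "..." next to a TK_NUMBER, e.g. 1 .. 2, since "1..2"
--   lexes as one malformed number
-- * any keyword/name/number pair, e.g. "local x", "return 1"
------------------------------------------------------------------------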

------------------------------------------------------------------------
-- repack tokens, removing deletions caused by optimization process
------------------------------------------------------------------------

local function repack_tokens()
  local dtoks, dinfos, dtoklns = {}, {}, {}
  local j = 1
  for i = 1, #stoks do
    local tok = stoks[i]
    if tok ~= "" then
      dtoks[j], dinfos[j], dtoklns[j] = tok, sinfos[i], stoklns[i]
      j = j + 1
    end
  end
  stoks, sinfos, stoklns = dtoks, dinfos, dtoklns
end

------------------------------------------------------------------------
-- number optimization
-- * optimization using string formatting functions is one way of doing
--   this, but here, we consider all cases and handle them separately
--   (possibly an idiotic approach...)
-- * scientific notation being generated is not in canonical form, this
--   may or may not be a bad thing, feedback welcome
-- * note: intermediate portions need to fit into a normal number range
-- * optimizations can be divided based on number patterns:
-- * hexadecimal:
--   (1) no need to remove leading zeros, just skip to (2)
--   (2) convert to integer if size equal or smaller
--       * change if equal size -> lose the 'x' to reduce entropy
--   (3) number is then processed as an integer
--   (4) note: does not make 0[xX] consistent
-- * integer:
--   (1) note: includes anything with trailing ".", ".0", ...
--   (2) remove useless fractional part, if present, e.g. 123.000
--   (3) remove leading zeros, e.g. 000123
--   (4) switch to scientific if shorter, e.g. 123000 -> 123e3
-- * with fraction:
--   (1) split into digits dot digits
--   (2) if no integer portion, take as zero (can omit later)
--   (3) handle degenerate .000 case, after which the fractional part
--       must be non-zero (if zero, it's matched as an integer)
--   (4) remove trailing zeros for fractional portion
--   (5) p.q where p > 0 and q > 0 cannot be shortened any more
--   (6) otherwise p == 0 and the form is .q, e.g. .000123
--   (7) if scientific shorter, convert, e.g. .000123 -> 123e-6
-- * scientific:
--   (1) split into (digits dot digits) [eE] ([+-] digits)
--   (2) if significand has ".", shift it out so it becomes an integer
--   (3) if significand is zero, just use zero
--   (4) remove leading zeros for significand
--   (5) shift out trailing zeros for significand
--   (6) examine exponent and determine which format is best:
--       integer, with fraction, scientific
------------------------------------------------------------------------

local function do_number(i)
  local before = sinfos[i]              -- 'before'
  local z = before                      -- working representation
  local y                               -- 'after', if better
  --------------------------------------------------------------------
  if match(z, "^0[xX]") then            -- hexadecimal number
    local v = base.tostring(base.tonumber(z))
    if #v <= #z then
      z = v                             -- change to integer, AND continue
    else
      return                            -- no change; stick to hex
    end
  end
  --------------------------------------------------------------------
  if match(z, "^%d+%.?0*$") then        -- integer or has useless frac
    z = match(z, "^(%d+)%.?0*$")        -- int portion only
    if z + 0 > 0 then
      z = match(z, "^0*([1-9]%d*)$")    -- remove leading zeros
      local v = #match(z, "0*$")
      local nv = base.tostring(v)
      if v > #nv + 1 then               -- scientific is shorter
        z = sub(z, 1, #z - v).."e"..nv
      end
      y = z
    else
      y = "0"                           -- basic zero
    end
  --------------------------------------------------------------------
  elseif not match(z, "[eE]") then      -- number with fraction part
    local p, q = match(z, "^(%d*)%.(%d+)$")     -- split
    if p == "" then p = 0 end                   -- int part zero
    if q + 0 == 0 and p == 0 then
      y = "0"                           -- degenerate .000 case
    else
      -- now, q > 0 holds and p is a number
      local v = #match(q, "0*$")        -- remove trailing zeros
      if v > 0 then
        q = sub(q, 1, #q - v)
      end
      -- if p > 0, nothing else we can do to simplify p.q case
      if p + 0 > 0 then
        y = p.."."..q
      else
        y = "."..q                      -- tentative, e.g. .000123
        local v = #match(q, "^0*")      -- # leading zeros
        local w = #q - v                -- # significant digits
        local nv = base.tostring(#q)
        -- e.g. compare 123e-6 versus .000123
        if w + 2 + #nv < 1 + #q then
          y = sub(q, -w).."e-"..nv
        end
      end
    end
  --------------------------------------------------------------------
  else                                  -- scientific number
    local sig, ex = match(z, "^([^eE]+)[eE]([%+%-]?%d+)$")
    ex = base.tonumber(ex)
    -- if got ".", shift out fractional portion of significand
    local p, q = match(sig, "^(%d*)%.(%d*)$")
    if p then
      ex = ex - #q
      sig = p..q
    end
    if sig + 0 == 0 then
      y = "0"                           -- basic zero
    else
      local v = #match(sig, "^0*")      -- remove leading zeros
      sig = sub(sig, v + 1)
      v = #match(sig, "0*$")            -- shift out trailing zeros
      if v > 0 then
        sig = sub(sig, 1, #sig - v)
        ex = ex + v
      end
      -- examine exponent and determine which format is best
      local nex = base.tostring(ex)
      if ex == 0 then                   -- it's just an integer
        y = sig
      elseif ex > 0 and (ex <= 1 + #nex) then   -- a number
        y = sig..rep("0", ex)
      elseif ex < 0 and (ex >= -#sig) then      -- fraction, e.g. .123
        v = #sig + ex
        y = sub(sig, 1, v).."."..sub(sig, v + 1)
      elseif ex < 0 and (#nex >= -ex - #sig) then
        -- e.g. compare 1234e-5 versus .01234
        -- gives: #sig + 1 + #nex >= 1 + (-ex - #sig) + #sig
        -- -> #nex >= -ex - #sig
        v = -ex - #sig
        y = "."..rep("0", v)..sig
      else                              -- non-canonical scientific representation
        y = sig.."e"..ex
      end
    end--if sig
  end
  --------------------------------------------------------------------
  if y and y ~= sinfos[i] then
    if opt_details then
      print("<number> (line "..stoklns[i]..") "..sinfos[i].." -> "..y)
      opt_details = opt_details + 1
    end
    sinfos[i] = y
  end
end
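
------------------------------------------------------------------------
-- worked examples for do_number (added for clarity; not part of the
-- original module) -- sinfos[i] before -> after:
--   0x1F     -> 31       decimal form is not longer, so hex is dropped
--   123.000  -> 123      useless fractional part removed
--   000123   -> 123      leading zeros removed
--   123000   -> 123e3    v = 3 trailing zeros > #"3" + 1, so scientific
--   .000123  -> 123e-6   w = 3, #q = 6: w + 2 + #nv < 1 + #q holds
--   1234e-5  -> .01234   #nex = 2 >= -ex - #sig = 1, fraction is shorter
------------------------------------------------------------------------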

------------------------------------------------------------------------
-- string optimization
-- * note: works on well-formed strings only!
-- * optimizations on characters can be summarized as follows:
--   \a\b\f\n\r\t\v -- no change
--   \\ -- no change
--   \"\' -- depends on delim, other can remove \
--   \[\] -- remove \
--   \<char> -- general escape, remove \
--   \<eol> -- normalize the EOL only
--   \ddd -- if \a\b\f\n\r\t\v, change to latter
--     if other < ascii 32, keep ddd but zap leading zeros
--     if >= ascii 32, translate it into the literal, then also
--     do escapes for \\,\",\' cases
--   <other> -- no change
-- * switch delimiters if string becomes shorter
------------------------------------------------------------------------

local function do_string(I)
  local info = sinfos[I]
  local delim = sub(info, 1, 1)                 -- delimiter used
  local ndelim = (delim == "'") and '"' or "'"  -- opposite " <-> '
  local z = sub(info, 2, -2)                    -- actual string
  local i = 1
  local c_delim, c_ndelim = 0, 0                -- "/' counts
  --------------------------------------------------------------------
  while i <= #z do
    local c = sub(z, i, i)
    ----------------------------------------------------------------
    if c == "\\" then                   -- escaped stuff
      local j = i + 1
      local d = sub(z, j, j)
      local p = find("abfnrtv\\\n\r\"\'0123456789", d, 1, true)
      ------------------------------------------------------------
      if not p then                     -- \<char> -- remove \
        z = sub(z, 1, i - 1)..sub(z, j)
        i = i + 1
      ------------------------------------------------------------
      elseif p <= 8 then                -- \a\b\f\n\r\t\v\\
        i = i + 2                       -- no change
      ------------------------------------------------------------
      elseif p <= 10 then               -- \<eol> -- normalize EOL
        local eol = sub(z, j, j + 1)
        if eol == "\r\n" or eol == "\n\r" then
          z = sub(z, 1, i).."\n"..sub(z, j + 2)
        elseif p == 10 then             -- \r case
          z = sub(z, 1, i).."\n"..sub(z, j + 1)
        end
        i = i + 2
      ------------------------------------------------------------
      elseif p <= 12 then               -- \"\' -- remove \ for ndelim
        if d == delim then
          c_delim = c_delim + 1
          i = i + 2
        else
          c_ndelim = c_ndelim + 1
          z = sub(z, 1, i - 1)..sub(z, j)
          i = i + 1
        end
      ------------------------------------------------------------
      else                              -- \ddd -- various steps
        local s = match(z, "^(%d%d?%d?)", j)
        j = i + 1 + #s                  -- skip to location
        local cv = s + 0
        local cc = string.char(cv)
        local p = find("\a\b\f\n\r\t\v", cc, 1, true)
        if p then                       -- special escapes
          s = "\\"..sub("abfnrtv", p, p)
        elseif cv < 32 then             -- normalized \ddd
          s = "\\"..cv
        elseif cc == delim then         -- \<delim>
          s = "\\"..cc
          c_delim = c_delim + 1
        elseif cc == "\\" then          -- \\
          s = "\\\\"
        else                            -- literal character
          s = cc
          if cc == ndelim then
            c_ndelim = c_ndelim + 1
          end
        end
        z = sub(z, 1, i - 1)..s..sub(z, j)
        i = i + #s
      ------------------------------------------------------------
      end--if p
    ----------------------------------------------------------------
    else  -- c ~= "\\" -- <other> -- no change
      i = i + 1
      if c == ndelim then       -- count ndelim, for switching delimiters
        c_ndelim = c_ndelim + 1
      end
    ----------------------------------------------------------------
    end--if c
  end--while
  --------------------------------------------------------------------
  -- switching delimiters, a long-winded derivation:
  -- (1) delim takes 2+2*c_delim bytes, ndelim takes c_ndelim bytes
  -- (2) delim becomes c_delim bytes, ndelim becomes 2+2*c_ndelim bytes
  -- simplifying the condition (1)>(2) --> c_delim > c_ndelim
  if c_delim > c_ndelim then
    i = 1
    while i <= #z do
      local p, q, r = find(z, "([\'\"])", i)
      if not p then break end
      if r == delim then                -- \<delim> -> <delim>
        z = sub(z, 1, p - 2)..sub(z, p)
        i = p
      else  -- r == ndelim -- <ndelim> -> \<ndelim>
        z = sub(z, 1, p - 1).."\\"..sub(z, p)
        i = p + 2
      end
    end--while
    delim = ndelim                      -- actually change delimiters
  end
  --------------------------------------------------------------------
  z = delim..z..delim
  if z ~= sinfos[I] then
    if opt_details then
      print("<string> (line "..stoklns[I]..") "..sinfos[I].." -> "..z)
      opt_details = opt_details + 1
    end
    sinfos[I] = z
  end
end
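
------------------------------------------------------------------------
-- illustrative examples for do_string (added for clarity; not part of
-- the original module):
--   "\q"             -> "q"     general escape, backslash dropped
--   "\65"            -> "A"     \ddd >= ascii 32 becomes the literal
--   "\009"           -> "\t"    \ddd mapping to a special escape
--   "say \"hi\" now" -> 'say "hi" now'
--   (c_delim = 2 > c_ndelim = 0, so switching delimiters saves the
--   two backslashes)
------------------------------------------------------------------------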

------------------------------------------------------------------------
-- long string optimization
-- * note: warning flagged if trailing whitespace found, not trimmed
-- * remove first optional newline
-- * normalize embedded newlines
-- * reduce '=' separators in delimiters if possible
------------------------------------------------------------------------

local function do_lstring(I)
  local info = sinfos[I]
  local delim1 = match(info, "^%[=*%[")         -- cut out delimiters
  local sep = #delim1
  local delim2 = sub(info, -sep, -1)
  local z = sub(info, sep + 1, -(sep + 1))      -- lstring without delims
  local y = ""
  local i = 1
  --------------------------------------------------------------------
  while true do
    local p, q, r, s = find(z, "([\r\n])([\r\n]?)", i)
    -- deal with a single line
    local ln
    if not p then
      ln = sub(z, i)
    elseif p >= i then
      ln = sub(z, i, p - 1)
    end
    if ln ~= "" then
      -- flag a warning if there are trailing spaces, won't optimize!
      if match(ln, "%s+$") then
        warn.lstring = "trailing whitespace in long string near line "..stoklns[I]
      end
      y = y..ln
    end
    if not p then       -- done if no more EOLs
      break
    end
    -- deal with line endings, normalize them
    i = p + 1
    if p then
      if #s > 0 and r ~= s then         -- skip CRLF or LFCR
        i = i + 1
      end
      -- skip first newline, which can be safely deleted
      if not(i == 1 and i == p) then
        y = y.."\n"
      end
    end
  end--while
  --------------------------------------------------------------------
  -- handle possible deletion of one or more '=' separators
  if sep >= 3 then
    local chk, okay = sep - 1
    -- loop to test ending delimiter with fewer '=' down to zero
    while chk >= 2 do
      local delim = "%]"..rep("=", chk - 2).."%]"
      if not match(y, delim) then okay = chk end
      chk = chk - 1
    end
    if okay then        -- change delimiters
      sep = rep("=", okay - 2)
      delim1, delim2 = "["..sep.."[", "]"..sep.."]"
    end
  end
  --------------------------------------------------------------------
  sinfos[I] = delim1..y..delim2
end
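
------------------------------------------------------------------------
-- illustrative example for do_lstring (added for clarity; not part of
-- the original module): for [==[one]]two]==] the body contains "]]",
-- so only the level-1 closer "]=]" is absent; okay = 3 and the token
-- becomes [=[one]]two]=], saving one '=' per side. A body without any
-- "]]" reduces all the way to [[ ... ]].
------------------------------------------------------------------------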

------------------------------------------------------------------------
-- long comment optimization
-- * note: does not remove first optional newline
-- * trim trailing whitespace
-- * normalize embedded newlines
-- * reduce '=' separators in delimiters if possible
------------------------------------------------------------------------

local function do_lcomment(I)
  local info = sinfos[I]
  local delim1 = match(info, "^%-%-%[=*%[")     -- cut out delimiters
  local sep = #delim1
  local delim2 = sub(info, -(sep - 2), -1)
  local z = sub(info, sep + 1, -(sep - 1))      -- comment without delims
  local y = ""
  local i = 1
  --------------------------------------------------------------------
  while true do
    local p, q, r, s = find(z, "([\r\n])([\r\n]?)", i)
    -- deal with a single line, extract and check trailing whitespace
    local ln
    if not p then
      ln = sub(z, i)
    elseif p >= i then
      ln = sub(z, i, p - 1)
    end
    if ln ~= "" then
      -- trim trailing whitespace if non-empty line
      local ws = match(ln, "%s*$")
      if #ws > 0 then ln = sub(ln, 1, -(#ws + 1)) end
      y = y..ln
    end
    if not p then       -- done if no more EOLs
      break
    end
    -- deal with line endings, normalize them
    i = p + 1
    if p then
      if #s > 0 and r ~= s then         -- skip CRLF or LFCR
        i = i + 1
      end
      y = y.."\n"
    end
  end--while
  --------------------------------------------------------------------
  -- handle possible deletion of one or more '=' separators
  sep = sep - 2
  if sep >= 3 then
    local chk, okay = sep - 1
    -- loop to test ending delimiter with fewer '=' down to zero
    while chk >= 2 do
      local delim = "%]"..rep("=", chk - 2).."%]"
      if not match(y, delim) then okay = chk end
      chk = chk - 1
    end
    if okay then        -- change delimiters
      sep = rep("=", okay - 2)
      delim1, delim2 = "--["..sep.."[", "]"..sep.."]"
    end
  end
  --------------------------------------------------------------------
  sinfos[I] = delim1..y..delim2
end

------------------------------------------------------------------------
-- short comment optimization
-- * trim trailing whitespace
------------------------------------------------------------------------

local function do_comment(i)
  local info = sinfos[i]
  local ws = match(info, "%s*$")        -- just look from end of string
  if #ws > 0 then
    info = sub(info, 1, -(#ws + 1))     -- trim trailing whitespace
  end
  sinfos[i] = info
end

------------------------------------------------------------------------
-- returns true if string found in long comment
-- * this is a feature to keep copyright or license texts
------------------------------------------------------------------------

local function keep_lcomment(opt_keep, info)
  if not opt_keep then return false end -- option not set
  local delim1 = match(info, "^%-%-%[=*%[")     -- cut out delimiters
  local sep = #delim1
  local delim2 = sub(info, -sep, -1)
  local z = sub(info, sep + 1, -(sep - 1))      -- comment without delims
  if find(z, opt_keep, 1, true) then    -- try to match
    return true
  end
end
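
------------------------------------------------------------------------
-- illustrative example for keep_lcomment (added for clarity; not part
-- of the original module): with option.KEEP = "Copyright", a comment
-- like --[[ Copyright (c) 2008 Kein-Hong Man ]] matches via the plain
-- (non-pattern) find and is preserved even when comment removal is on
------------------------------------------------------------------------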

------------------------------------------------------------------------
-- main entry point
-- * currently, lexer processing has 2 passes
-- * processing is done on a line-oriented basis, which is easier to
--   grok due to the next point...
-- * since there are various options that can be enabled or disabled,
--   processing is a little messy or convoluted
------------------------------------------------------------------------

function optimize(option, toklist, semlist, toklnlist)
  --------------------------------------------------------------------
  -- set option flags
  --------------------------------------------------------------------
  local opt_comments = option["opt-comments"]
  local opt_whitespace = option["opt-whitespace"]
  local opt_emptylines = option["opt-emptylines"]
  local opt_eols = option["opt-eols"]
  local opt_strings = option["opt-strings"]
  local opt_numbers = option["opt-numbers"]
  local opt_keep = option.KEEP
  opt_details = option.DETAILS and 0    -- upvalues for details display
  print = print or base.print
  if opt_eols then      -- forced settings, otherwise won't work properly
    opt_comments = true
    opt_whitespace = true
    opt_emptylines = true
  end
  --------------------------------------------------------------------
  -- variable initialization
  --------------------------------------------------------------------
  stoks, sinfos, stoklns                -- set source lists
    = toklist, semlist, toklnlist
  local i = 1                           -- token position
  local tok, info                       -- current token
  local prev                            -- position of last grammar token
                                        -- on same line (for TK_SPACE stuff)
  --------------------------------------------------------------------
  -- changes a token, info pair
  --------------------------------------------------------------------
  local function settoken(tok, info, I)
    I = I or i
    stoks[I] = tok or ""
    sinfos[I] = info or ""
  end
  --------------------------------------------------------------------
  -- processing loop (PASS 1)
  --------------------------------------------------------------------
  while true do
    tok, info = stoks[i], sinfos[i]
    ----------------------------------------------------------------
    local atstart = atlinestart(i)      -- set line begin flag
    if atstart then prev = nil end
    ----------------------------------------------------------------
    if tok == "TK_EOS" then             -- end of stream/pass
      break
    ----------------------------------------------------------------
    elseif tok == "TK_KEYWORD" or       -- keywords, identifiers,
           tok == "TK_NAME" or          -- operators
           tok == "TK_OP" then
      -- TK_KEYWORD and TK_OP can't be optimized without a big
      -- optimization framework; it would be more of an optimizing
      -- compiler, not a source code compressor
      -- TK_NAME tokens that are locals need the parser to analyze/optimize
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_NUMBER" then      -- numbers
      if opt_numbers then
        do_number(i)                    -- optimize
      end
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_STRING" or        -- strings, long strings
           tok == "TK_LSTRING" then
      if opt_strings then
        if tok == "TK_STRING" then
          do_string(i)                  -- optimize
        else
          do_lstring(i)                 -- optimize
        end
      end
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_COMMENT" then     -- short comments
      if opt_comments then
        if i == 1 and sub(info, 1, 1) == "#" then
          -- keep shbang comment, trim whitespace
          do_comment(i)
        else
          -- safe to delete, as a TK_EOL (or TK_EOS) always follows
          settoken()                    -- remove entirely
        end
      elseif opt_whitespace then        -- trim whitespace only
        do_comment(i)
      end
    ----------------------------------------------------------------
    elseif tok == "TK_LCOMMENT" then    -- long comments
      if keep_lcomment(opt_keep, info) then
        ------------------------------------------------------------
        -- if --keep, we keep a long comment if <msg> is found;
        -- this is a feature to keep copyright or license texts
        if opt_whitespace then          -- trim whitespace only
          do_lcomment(i)
        end
        prev = i
      elseif opt_comments then
        local eols = commenteols(info)
        ------------------------------------------------------------
        -- prepare opt_emptylines case first, if a disposable token
        -- follows, current one is safe to dump, else keep a space;
        -- it is implied that the operation is safe for '-', because
        -- current is a TK_LCOMMENT, and must be separate from a '-'
        if is_faketoken[stoks[i + 1]] then
          settoken()                    -- remove entirely
          tok = ""
        else
          settoken("TK_SPACE", " ")
        end
        ------------------------------------------------------------
        -- if there are embedded EOLs to keep and opt_emptylines is
        -- disabled, then switch the token into one or more EOLs
        if not opt_emptylines and eols > 0 then
          settoken("TK_EOL", rep("\n", eols))
        end
        ------------------------------------------------------------
        -- if optimizing whitespaces, force reinterpretation of the
        -- token to give a chance for the space to be optimized away
        if opt_whitespace and tok ~= "" then
          i = i - 1                     -- to reinterpret
        end
        ------------------------------------------------------------
      else                              -- disabled case
        if opt_whitespace then          -- trim whitespace only
          do_lcomment(i)
        end
        prev = i
      end
    ----------------------------------------------------------------
    elseif tok == "TK_EOL" then         -- line endings
      if atstart and opt_emptylines then
        settoken()                      -- remove entirely
      elseif info == "\r\n" or info == "\n\r" then
        -- normalize the rest of the EOLs for CRLF/LFCR only
        -- (note that TK_LCOMMENT can change into several EOLs)
        settoken("TK_EOL", "\n")
      end
    ----------------------------------------------------------------
    elseif tok == "TK_SPACE" then       -- whitespace
      if opt_whitespace then
        if atstart or atlineend(i) then
          -- delete leading and trailing whitespace
          settoken()                    -- remove entirely
        else
          ------------------------------------------------------------
          -- at this point, since leading whitespace has been removed,
          -- there should be either a real token or a TK_LCOMMENT
          -- prior to hitting this whitespace; the TK_LCOMMENT case
          -- only happens if opt_comments is disabled; so prev ~= nil
          local ptok = stoks[prev]
          if ptok == "TK_LCOMMENT" then
            -- previous TK_LCOMMENT can abut with anything
            settoken()                  -- remove entirely
          else
            -- prev must be a grammar token; consecutive TK_SPACE
            -- tokens are impossible when optimizing whitespace
            local ntok = stoks[i + 1]
            if is_faketoken[ntok] then
              -- handle special case where a '-' cannot abut with
              -- either a short comment or a long comment
              if (ntok == "TK_COMMENT" or ntok == "TK_LCOMMENT") and
                 ptok == "TK_OP" and sinfos[prev] == "-" then
                -- keep token
              else
                settoken()              -- remove entirely
              end
            else  -- is_realtoken
              -- check a pair of grammar tokens, if can abut, then
              -- delete space token entirely, otherwise keep one space
              local s = checkpair(prev, i + 1)
              if s == "" then
                settoken()              -- remove entirely
              else
                settoken("TK_SPACE", " ")
              end
            end
          end
          ------------------------------------------------------------
        end
      end
    ----------------------------------------------------------------
    else
      error("unidentified token encountered")
    end
    ----------------------------------------------------------------
    i = i + 1
  end--while
  repack_tokens()
  --------------------------------------------------------------------
  -- processing loop (PASS 2)
  --------------------------------------------------------------------
  if opt_eols then
    i = 1
    -- aggressive EOL removal only works with most non-grammar tokens
    -- optimized away because it is a rather simple scheme -- basically
    -- it just checks 'real' token pairs around EOLs
    if stoks[1] == "TK_COMMENT" then
      -- first comment still existing must be shbang, skip whole line
      i = 3
    end
    while true do
      tok, info = stoks[i], sinfos[i]
      --------------------------------------------------------------
      if tok == "TK_EOS" then           -- end of stream/pass
        break
      --------------------------------------------------------------
      elseif tok == "TK_EOL" then       -- consider each TK_EOL
        local t1, t2 = stoks[i - 1], stoks[i + 1]
        if is_realtoken[t1] and is_realtoken[t2] then   -- sanity check
          local s = checkpair(i - 1, i + 1)
          if s == "" then
            settoken()                  -- remove entirely
          end
        end
      end--if tok
      --------------------------------------------------------------
      i = i + 1
    end--while
    repack_tokens()
  end
  --------------------------------------------------------------------
  if opt_details and opt_details > 0 then print() end   -- spacing
  return stoks, sinfos, stoklns
end
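
------------------------------------------------------------------------
-- usage sketch (added for illustration; not part of the original
-- module): the three parallel lists come from LuaSrcDiet's llex lexer;
-- the llex field names and call signature below are assumptions based
-- on that lexer, so treat this as a sketch, not a reference:
--
--   local llex = require "llex"
--   local optlex = require "optlex"
--   llex.init(source)                  -- assumed signature
--   llex.llex()
--   local option = {
--     ["opt-comments"] = true, ["opt-whitespace"] = true,
--     ["opt-emptylines"] = true, ["opt-numbers"] = true,
--     ["opt-strings"] = true, ["opt-eols"] = false,
--   }
--   local toks, infos, toklns =
--     optlex.optimize(option, llex.tok, llex.seminfo, llex.tokln)
--   -- each sinfos entry holds the token's text; concatenating them
--   -- in order reproduces the optimized source
--   io.write(table.concat(infos))
------------------------------------------------------------------------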